Initial push to gitea
This commit is contained in:
49
internal/transcribe/segments.go
Normal file
49
internal/transcribe/segments.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package transcribe
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Segment is a single timestamped span of a transcript.
type Segment struct {
	// Start is where the span begins, in seconds from the start of the audio.
	Start float64
	// End is where the span ends, measured the same way as Start.
	End float64
	// Text is the transcribed text for the span.
	Text string
}
|
||||
|
||||
// PlainText joins all segments into a single transcript.
|
||||
func PlainText(segs []Segment) string {
|
||||
var b strings.Builder
|
||||
for _, s := range segs {
|
||||
b.WriteString(strings.TrimSpace(s.Text))
|
||||
b.WriteByte(' ')
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// FormatForLLM renders segments as one timestamped line each, suitable for
|
||||
// feeding to a model that needs to pick a time window.
|
||||
//
|
||||
// [mm:ss] [mm:ss] text
|
||||
func FormatForLLM(segs []Segment) string {
|
||||
var b strings.Builder
|
||||
for _, s := range segs {
|
||||
fmt.Fprintf(&b, "[%s] [%s] %s\n", formatTS(s.Start), formatTS(s.End), strings.TrimSpace(s.Text))
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// formatTS renders a time offset given in seconds as a zero-padded "mm:ss"
// timestamp, switching to "hh:mm:ss" once the value reaches one hour.
// Negative inputs are clamped to zero and fractional seconds are truncated.
func formatTS(seconds float64) string {
	if seconds < 0 {
		seconds = 0
	}
	whole := int(seconds)
	minutes, secs := whole/60, whole%60
	hours := minutes / 60
	minutes %= 60
	if hours <= 0 {
		return fmt.Sprintf("%02d:%02d", minutes, secs)
	}
	return fmt.Sprintf("%02d:%02d:%02d", hours, minutes, secs)
}
|
||||
10
internal/transcribe/transcribe.go
Normal file
10
internal/transcribe/transcribe.go
Normal file
@@ -0,0 +1,10 @@
|
||||
// Package transcribe converts a normalized WAV file into a plain-text transcript.
|
||||
package transcribe
|
||||
|
||||
import "context"
|
||||
|
||||
// Transcriber is a speech-to-text backend: it turns a 16kHz mono WAV at
// wavPath into a plaintext transcript.
type Transcriber interface {
	// Name returns a short label for the backend.
	Name() string
	// Transcribe returns the transcript of the WAV file at wavPath, or an
	// error. ctx is passed through to the backend.
	Transcribe(ctx context.Context, wavPath string) (string, error)
}
|
||||
213
internal/transcribe/whispercpp.go
Normal file
213
internal/transcribe/whispercpp.go
Normal file
@@ -0,0 +1,213 @@
|
||||
package transcribe
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// WhisperCPP transcribes audio by shelling out to a whisper.cpp CLI binary
// (whisper-cli, whisper-cpp, or legacy `main`). The binary is invoked with
// `-oj` and must write a .json file at the requested output basename, which
// is placed next to the input WAV; that file is then read back and parsed.
type WhisperCPP struct {
	// Bin is the whisper.cpp binary name or absolute path. When empty, a
	// binary is auto-detected: GPU builds first, then CPU builds on PATH.
	Bin string

	// Model is the path to a ggml whisper model (.bin). Required.
	Model string

	// Language to force (passed as -l); empty means auto-detect.
	Language string

	// Threads to use (passed as -t); 0 lets whisper.cpp pick.
	Threads int

	// ExtraArgs are appended to the command verbatim, after all other flags.
	ExtraArgs []string

	// Verbose enables per-step diagnostic logging to stderr (which GPU
	// backend was skipped and why). When the binary is auto-detected, the
	// selected backend is logged on a single stderr line regardless of this
	// flag.
	Verbose bool
}
|
||||
|
||||
// Name reports the fixed backend label "whisper.cpp".
func (w *WhisperCPP) Name() string {
	const label = "whisper.cpp"
	return label
}
|
||||
|
||||
func (w *WhisperCPP) Transcribe(ctx context.Context, wavPath string) (string, error) {
|
||||
segs, err := w.TranscribeSegments(ctx, wavPath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return PlainText(segs), nil
|
||||
}
|
||||
|
||||
// TranscribeSegments runs whisper.cpp with JSON output and returns the
|
||||
// per-segment timestamps (in seconds) and text.
|
||||
func (w *WhisperCPP) TranscribeSegments(ctx context.Context, wavPath string) ([]Segment, error) {
|
||||
bin, err := w.resolveBin()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if w.Model == "" {
|
||||
return nil, fmt.Errorf("whisper.cpp model path is required (--whisper-model)")
|
||||
}
|
||||
if _, err := os.Stat(w.Model); err != nil {
|
||||
return nil, fmt.Errorf("whisper model not readable at %s: %w", w.Model, err)
|
||||
}
|
||||
|
||||
dir := filepath.Dir(wavPath)
|
||||
base := strings.TrimSuffix(filepath.Base(wavPath), filepath.Ext(wavPath))
|
||||
outBase := filepath.Join(dir, base)
|
||||
jsonPath := outBase + ".json"
|
||||
_ = os.Remove(jsonPath)
|
||||
|
||||
args := []string{
|
||||
"-m", w.Model,
|
||||
"-f", wavPath,
|
||||
"-oj",
|
||||
"-of", outBase,
|
||||
"--no-prints",
|
||||
}
|
||||
if w.Language != "" {
|
||||
args = append(args, "-l", w.Language)
|
||||
}
|
||||
if w.Threads > 0 {
|
||||
args = append(args, "-t", fmt.Sprintf("%d", w.Threads))
|
||||
}
|
||||
args = append(args, w.ExtraArgs...)
|
||||
|
||||
cmd := exec.CommandContext(ctx, bin, args...)
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
return nil, fmt.Errorf("%s: %w", bin, err)
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(jsonPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading whisper json %s: %w", jsonPath, err)
|
||||
}
|
||||
return parseWhisperJSON(data)
|
||||
}
|
||||
|
||||
// gpuBackend describes one accelerated whisper.cpp build that may be picked
// at runtime. The binary is conventionally installed at ~/.local/bin/<bin>
// (or anywhere on PATH).
type gpuBackend struct {
	// name is the backend label used in log lines.
	name string
	// bin is the executable name to search for.
	bin string
	// probe is the argv of a fast command that exits 0 only when the
	// matching GPU runtime is actually usable on this machine.
	probe []string
}
|
||||
|
||||
var gpuBackends = []gpuBackend{
|
||||
{"CUDA", "whisper-cli-cuda", []string{"nvidia-smi", "-L"}},
|
||||
{"ROCm", "whisper-cli-rocm", []string{"rocminfo"}},
|
||||
{"Vulkan", "whisper-cli-vulkan", []string{"vulkaninfo", "--summary"}},
|
||||
}
|
||||
|
||||
func (w *WhisperCPP) resolveBin() (string, error) {
|
||||
if w.Bin != "" {
|
||||
if _, err := exec.LookPath(w.Bin); err == nil {
|
||||
return w.Bin, nil
|
||||
}
|
||||
if _, err := os.Stat(w.Bin); err == nil {
|
||||
return w.Bin, nil
|
||||
}
|
||||
return "", fmt.Errorf("whisper.cpp binary %q not found on PATH", w.Bin)
|
||||
}
|
||||
|
||||
// Metal is always usable on macOS — no separate probe needed; if the
|
||||
// binary exists we trust it.
|
||||
if runtime.GOOS == "darwin" {
|
||||
if path := findBinary("whisper-cli-metal"); path != "" {
|
||||
fmt.Fprintf(os.Stderr, "whisper: using Metal backend (%s)\n", path)
|
||||
return path, nil
|
||||
}
|
||||
}
|
||||
|
||||
for _, b := range gpuBackends {
|
||||
path := findBinary(b.bin)
|
||||
if path == "" {
|
||||
if w.Verbose {
|
||||
fmt.Fprintf(os.Stderr, "whisper: no %s binary (%s) installed; skipping\n", b.name, b.bin)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !probeSucceeds(b.probe) {
|
||||
if w.Verbose {
|
||||
fmt.Fprintf(os.Stderr, "whisper: %s binary present at %s but %s probe failed; trying next\n", b.name, path, b.probe[0])
|
||||
}
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "whisper: using %s backend (%s)\n", b.name, path)
|
||||
return path, nil
|
||||
}
|
||||
|
||||
for _, alt := range []string{"whisper-cli", "whisper-cpp", "main"} {
|
||||
if path, e := exec.LookPath(alt); e == nil {
|
||||
fmt.Fprintf(os.Stderr, "whisper: using CPU backend (%s)\n", path)
|
||||
return path, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no whisper.cpp binary found (tried GPU builds whisper-cli-{cuda,rocm,vulkan} in ~/.local/bin and PATH, then CPU whisper-cli/whisper-cpp/main on PATH); pass --whisper-bin")
|
||||
}
|
||||
|
||||
// findBinary locates an executable called name. It checks ~/.local/bin first
// (the convention for hand-built backends), accepting any non-directory
// entry there, and then falls back to a PATH lookup. It returns "" when
// neither location has it.
func findBinary(name string) string {
	home, homeErr := os.UserHomeDir()
	if homeErr == nil {
		local := filepath.Join(home, ".local", "bin", name)
		if st, statErr := os.Stat(local); statErr == nil && !st.IsDir() {
			return local
		}
	}
	onPath, lookErr := exec.LookPath(name)
	if lookErr != nil {
		return ""
	}
	return onPath
}
|
||||
|
||||
// probeSucceeds runs the probe command argv (program followed by its
// arguments) with a 5-second timeout and reports whether it exited 0. It is
// used to confirm the GPU runtime is actually usable before committing to
// its whisper-cli build.
//
// It returns false without running anything when argv is empty or when the
// program cannot be found via exec.LookPath.
func probeSucceeds(argv []string) bool {
	// Guard the argv[0] accesses below; an empty probe can never succeed.
	if len(argv) == 0 {
		return false
	}
	if _, err := exec.LookPath(argv[0]); err != nil {
		return false
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
	return cmd.Run() == nil
}
|
||||
|
||||
// whisperJSONFile mirrors the part of the structure whisper.cpp writes with
// -oj that this package reads: a list of segments, each with its time
// offsets and text.
type whisperJSONFile struct {
	// Transcription holds one entry per transcribed segment.
	Transcription []struct {
		// Offsets are the segment bounds; parseWhisperJSON divides them
		// by 1000 to obtain seconds.
		Offsets struct {
			From int64 `json:"from"`
			To   int64 `json:"to"`
		} `json:"offsets"`
		// Text is the segment text as decoded; it is not trimmed here.
		Text string `json:"text"`
	} `json:"transcription"`
}
|
||||
|
||||
func parseWhisperJSON(data []byte) ([]Segment, error) {
|
||||
var f whisperJSONFile
|
||||
if err := json.Unmarshal(data, &f); err != nil {
|
||||
return nil, fmt.Errorf("parsing whisper JSON: %w", err)
|
||||
}
|
||||
if len(f.Transcription) == 0 {
|
||||
return nil, fmt.Errorf("whisper produced no transcription segments")
|
||||
}
|
||||
out := make([]Segment, 0, len(f.Transcription))
|
||||
for _, s := range f.Transcription {
|
||||
out = append(out, Segment{
|
||||
Start: float64(s.Offsets.From) / 1000.0,
|
||||
End: float64(s.Offsets.To) / 1000.0,
|
||||
Text: s.Text,
|
||||
})
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
Reference in New Issue
Block a user