package transcribe import ( "context" "encoding/json" "fmt" "os" "os/exec" "path/filepath" "runtime" "strings" "time" ) // WhisperCPP shells out to a whisper.cpp CLI binary (whisper-cli, whisper-cpp, // or legacy `main`) and reads its `-otxt` output. The binary must produce a // .txt file next to the requested output basename. type WhisperCPP struct { // Bin is the whisper.cpp binary name or absolute path. Bin string // Model is the path to a ggml whisper model (.bin). Model string // Language to force; empty means auto-detect. Language string // Threads to use; 0 lets whisper.cpp pick. Threads int // ExtraArgs are appended to the command verbatim. ExtraArgs []string // Verbose enables per-step diagnostic logging to stderr (which probe ran, // which backend was selected, etc.). The selected backend is always logged // on a single stderr line regardless of this flag. Verbose bool } func (w *WhisperCPP) Name() string { return "whisper.cpp" } func (w *WhisperCPP) Transcribe(ctx context.Context, wavPath string) (string, error) { segs, err := w.TranscribeSegments(ctx, wavPath) if err != nil { return "", err } return PlainText(segs), nil } // TranscribeSegments runs whisper.cpp with JSON output and returns the // per-segment timestamps (in seconds) and text. func (w *WhisperCPP) TranscribeSegments(ctx context.Context, wavPath string) ([]Segment, error) { bin, err := w.resolveBin() if err != nil { return nil, err } if w.Model == "" { return nil, fmt.Errorf("whisper.cpp model path is required (--whisper-model)") } if _, err := os.Stat(w.Model); err != nil { return nil, fmt.Errorf("whisper model not readable at %s: %w", w.Model, err) } dir := filepath.Dir(wavPath) base := strings.TrimSuffix(filepath.Base(wavPath), filepath.Ext(wavPath)) outBase := filepath.Join(dir, base) jsonPath := outBase + ".json" _ = os.Remove(jsonPath) args := []string{ "-m", w.Model, "-f", wavPath, "-oj", "-of", outBase, "--no-prints", } if w.Language != "" { args = append(args, "-l", w.Language) } if w.Threads > 0 { args = append(args, "-t", fmt.Sprintf("%d", w.Threads)) } args = append(args, w.ExtraArgs...) cmd := exec.CommandContext(ctx, bin, args...) cmd.Stdout = os.Stderr cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { return nil, fmt.Errorf("%s: %w", bin, err) } data, err := os.ReadFile(jsonPath) if err != nil { return nil, fmt.Errorf("reading whisper json %s: %w", jsonPath, err) } return parseWhisperJSON(data) } // gpuBackend describes one accelerated whisper.cpp build we may pick at // runtime. The binary is conventionally installed at ~/.local/bin/ (or // anywhere on PATH); the probe is a fast command that exits 0 only when the // matching GPU runtime is actually usable on this machine. type gpuBackend struct { name string bin string probe []string } var gpuBackends = []gpuBackend{ {"CUDA", "whisper-cli-cuda", []string{"nvidia-smi", "-L"}}, {"ROCm", "whisper-cli-rocm", []string{"rocminfo"}}, {"Vulkan", "whisper-cli-vulkan", []string{"vulkaninfo", "--summary"}}, } func (w *WhisperCPP) resolveBin() (string, error) { if w.Bin != "" { if _, err := exec.LookPath(w.Bin); err == nil { return w.Bin, nil } if _, err := os.Stat(w.Bin); err == nil { return w.Bin, nil } return "", fmt.Errorf("whisper.cpp binary %q not found on PATH", w.Bin) } // Metal is always usable on macOS — no separate probe needed; if the // binary exists we trust it. if runtime.GOOS == "darwin" { if path := findBinary("whisper-cli-metal"); path != "" { fmt.Fprintf(os.Stderr, "whisper: using Metal backend (%s)\n", path) return path, nil } } for _, b := range gpuBackends { path := findBinary(b.bin) if path == "" { if w.Verbose { fmt.Fprintf(os.Stderr, "whisper: no %s binary (%s) installed; skipping\n", b.name, b.bin) } continue } if !probeSucceeds(b.probe) { if w.Verbose { fmt.Fprintf(os.Stderr, "whisper: %s binary present at %s but %s probe failed; trying next\n", b.name, path, b.probe[0]) } continue } fmt.Fprintf(os.Stderr, "whisper: using %s backend (%s)\n", b.name, path) return path, nil } for _, alt := range []string{"whisper-cli", "whisper-cpp", "main"} { if path, e := exec.LookPath(alt); e == nil { fmt.Fprintf(os.Stderr, "whisper: using CPU backend (%s)\n", path) return path, nil } } return "", fmt.Errorf("no whisper.cpp binary found (tried GPU builds whisper-cli-{cuda,rocm,vulkan} in ~/.local/bin and PATH, then CPU whisper-cli/whisper-cpp/main on PATH); pass --whisper-bin") } // findBinary looks for an executable first in ~/.local/bin (the convention // for hand-built backends), then on PATH. Returns "" if neither has it. func findBinary(name string) string { if home, err := os.UserHomeDir(); err == nil { candidate := filepath.Join(home, ".local", "bin", name) if info, err := os.Stat(candidate); err == nil && !info.IsDir() { return candidate } } if path, err := exec.LookPath(name); err == nil { return path } return "" } // probeSucceeds runs the probe with a short timeout and reports whether it // exited 0. Used to confirm the GPU runtime is actually usable before we // commit to its whisper-cli build. func probeSucceeds(argv []string) bool { if _, err := exec.LookPath(argv[0]); err != nil { return false } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() cmd := exec.CommandContext(ctx, argv[0], argv[1:]...) return cmd.Run() == nil } // whisperJSONFile mirrors the structure whisper.cpp writes with -oj. type whisperJSONFile struct { Transcription []struct { Offsets struct { From int64 `json:"from"` To int64 `json:"to"` } `json:"offsets"` Text string `json:"text"` } `json:"transcription"` } func parseWhisperJSON(data []byte) ([]Segment, error) { var f whisperJSONFile if err := json.Unmarshal(data, &f); err != nil { return nil, fmt.Errorf("parsing whisper JSON: %w", err) } if len(f.Transcription) == 0 { return nil, fmt.Errorf("whisper produced no transcription segments") } out := make([]Segment, 0, len(f.Transcription)) for _, s := range f.Transcription { out = append(out, Segment{ Start: float64(s.Offsets.From) / 1000.0, End: float64(s.Offsets.To) / 1000.0, Text: s.Text, }) } return out, nil }