Files
Summerize/internal/transcribe/whispercpp.go
2026-05-10 13:37:17 -06:00

214 lines
6.2 KiB
Go

package transcribe
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"time"
)
// WhisperCPP shells out to a whisper.cpp CLI binary (whisper-cli, whisper-cpp,
// or legacy `main`) and reads its `-oj` JSON output. The binary must produce a
// .json file next to the requested output basename.
type WhisperCPP struct {
// Bin is the whisper.cpp binary name or path. When empty, the binary is
// auto-detected: GPU builds are tried first, then CPU builds on PATH.
Bin string
// Model is the path to a ggml whisper model (.bin). It is required and must
// exist on disk.
Model string
// Language to force (passed via -l); empty means auto-detect.
Language string
// Threads to use (passed via -t); values <= 0 omit the flag and let
// whisper.cpp pick.
Threads int
// ExtraArgs are appended to the command verbatim, after all other flags.
ExtraArgs []string
// Verbose enables per-step diagnostic logging to stderr (which backend
// binary was missing, which probe failed, etc.). When the binary is
// auto-detected, the selected backend is always logged on a single stderr
// line regardless of this flag.
Verbose bool
}
// Name returns the constant label "whisper.cpp" for this transcriber.
func (w *WhisperCPP) Name() string {
	return "whisper.cpp"
}
// Transcribe runs whisper.cpp on wavPath and returns the transcript as a
// single string. It delegates to TranscribeSegments and flattens the result
// with PlainText; on failure it returns the empty string together with the
// error from TranscribeSegments, unchanged.
func (w *WhisperCPP) Transcribe(ctx context.Context, wavPath string) (string, error) {
	var text string
	segments, err := w.TranscribeSegments(ctx, wavPath)
	if err != nil {
		return text, err
	}
	text = PlainText(segments)
	return text, nil
}
// TranscribeSegments runs whisper.cpp with JSON output and returns the
// per-segment timestamps (in seconds) and text.
//
// The executable comes from resolveBin, and w.Model must be set and exist on
// disk. whisper.cpp is invoked with -oj and -of so that it writes
// "<wav basename>.json" next to wavPath; a file already present at that path
// is removed first (best effort, the removal error is ignored). The child's
// stdout and stderr are both forwarded to this process's stderr. ctx is
// attached to the child via exec.CommandContext.
//
// Errors are returned when the binary cannot be resolved, the model is
// missing, the command fails, the JSON file cannot be read, or
// parseWhisperJSON rejects its contents.
func (w *WhisperCPP) TranscribeSegments(ctx context.Context, wavPath string) ([]Segment, error) {
	whisperBin, err := w.resolveBin()
	if err != nil {
		return nil, err
	}

	// Validate the model up front so the user gets a clear message instead of
	// whatever whisper.cpp prints for a missing file.
	if w.Model == "" {
		return nil, fmt.Errorf("whisper.cpp model path is required (--whisper-model)")
	}
	if _, statErr := os.Stat(w.Model); statErr != nil {
		return nil, fmt.Errorf("whisper model not readable at %s: %w", w.Model, statErr)
	}

	// Output basename: same directory and stem as the input, extension dropped.
	stem := strings.TrimSuffix(filepath.Base(wavPath), filepath.Ext(wavPath))
	outBase := filepath.Join(filepath.Dir(wavPath), stem)
	jsonPath := outBase + ".json"
	_ = os.Remove(jsonPath) // best effort: the file usually does not exist yet

	args := make([]string, 0, 12+len(w.ExtraArgs))
	args = append(args,
		"-m", w.Model,
		"-f", wavPath,
		"-oj",
		"-of", outBase,
		"--no-prints",
	)
	if w.Language != "" {
		args = append(args, "-l", w.Language)
	}
	if w.Threads > 0 {
		args = append(args, "-t", fmt.Sprint(w.Threads))
	}
	// User-supplied flags go last.
	args = append(args, w.ExtraArgs...)

	run := exec.CommandContext(ctx, whisperBin, args...)
	// Everything the child prints goes to our stderr.
	run.Stdout, run.Stderr = os.Stderr, os.Stderr
	if runErr := run.Run(); runErr != nil {
		return nil, fmt.Errorf("%s: %w", whisperBin, runErr)
	}

	raw, err := os.ReadFile(jsonPath)
	if err != nil {
		return nil, fmt.Errorf("reading whisper json %s: %w", jsonPath, err)
	}
	return parseWhisperJSON(raw)
}
// gpuBackend describes one accelerated whisper.cpp build we may pick at
// runtime. The binary is conventionally installed at ~/.local/bin/<bin> (or
// anywhere on PATH); the probe is a fast command that exits 0 only when the
// matching GPU runtime is actually usable on this machine.
type gpuBackend struct {
// name is the human-readable backend label used in log messages.
name string
// bin is the executable name looked up with findBinary.
bin string
// probe is the argv of the probe command; probe[0] is the program to run.
probe []string
}
// gpuBackends lists the accelerated builds resolveBin considers, in priority
// order: the first entry whose binary is found and whose probe succeeds wins.
var gpuBackends = []gpuBackend{
	{
		name:  "CUDA",
		bin:   "whisper-cli-cuda",
		probe: []string{"nvidia-smi", "-L"},
	},
	{
		name:  "ROCm",
		bin:   "whisper-cli-rocm",
		probe: []string{"rocminfo"},
	},
	{
		name:  "Vulkan",
		bin:   "whisper-cli-vulkan",
		probe: []string{"vulkaninfo", "--summary"},
	},
}
// resolveBin picks the whisper.cpp executable to run.
//
// When w.Bin is set, only that binary is considered:
//   - if exec.LookPath accepts it, w.Bin is returned unchanged;
//   - otherwise, if it names an existing non-directory file, its absolute
//     path is returned. A bare file name that is not on PATH would otherwise
//     be looked up on PATH again by exec.Command and fail, even though the
//     file exists in the working directory;
//   - otherwise an error is returned.
//
// When w.Bin is empty the binary is auto-detected in this order:
//  1. on macOS, whisper-cli-metal if findBinary locates it (no probe);
//  2. each gpuBackends entry in order, requiring both its binary and a
//     successful probe;
//  3. the CPU builds whisper-cli, whisper-cpp and main on PATH.
//
// The auto-detected backend is always announced on stderr; skipped candidates
// are reported only when w.Verbose is set.
func (w *WhisperCPP) resolveBin() (string, error) {
	if w.Bin != "" {
		if _, err := exec.LookPath(w.Bin); err == nil {
			return w.Bin, nil
		}
		// LookPath rejected it (not on PATH, or found only relative to the
		// current directory). Fall back to treating it as a file path, but
		// hand exec an absolute path so it does not repeat the PATH lookup.
		if info, err := os.Stat(w.Bin); err == nil && !info.IsDir() {
			if abs, err := filepath.Abs(w.Bin); err == nil {
				return abs, nil
			}
			return w.Bin, nil
		}
		return "", fmt.Errorf("whisper.cpp binary %q not found on PATH", w.Bin)
	}

	// Metal is always usable on macOS — no separate probe needed; if the
	// binary exists we trust it.
	if runtime.GOOS == "darwin" {
		if path := findBinary("whisper-cli-metal"); path != "" {
			fmt.Fprintf(os.Stderr, "whisper: using Metal backend (%s)\n", path)
			return path, nil
		}
	}

	for _, b := range gpuBackends {
		path := findBinary(b.bin)
		if path == "" {
			if w.Verbose {
				fmt.Fprintf(os.Stderr, "whisper: no %s binary (%s) installed; skipping\n", b.name, b.bin)
			}
			continue
		}
		// A GPU build is only useful if the matching runtime actually works.
		if !probeSucceeds(b.probe) {
			if w.Verbose {
				fmt.Fprintf(os.Stderr, "whisper: %s binary present at %s but %s probe failed; trying next\n", b.name, path, b.probe[0])
			}
			continue
		}
		fmt.Fprintf(os.Stderr, "whisper: using %s backend (%s)\n", b.name, path)
		return path, nil
	}

	// No usable GPU build: fall back to a plain CPU binary on PATH.
	for _, alt := range []string{"whisper-cli", "whisper-cpp", "main"} {
		if path, e := exec.LookPath(alt); e == nil {
			fmt.Fprintf(os.Stderr, "whisper: using CPU backend (%s)\n", path)
			return path, nil
		}
	}
	return "", fmt.Errorf("no whisper.cpp binary found (tried GPU builds whisper-cli-{cuda,rocm,vulkan} in ~/.local/bin and PATH, then CPU whisper-cli/whisper-cpp/main on PATH); pass --whisper-bin")
}
// findBinary looks for an executable first in ~/.local/bin (the convention
// for hand-built backends), then on PATH. Returns "" if neither has it.
//
// The ~/.local/bin candidate is validated with exec.LookPath rather than a
// bare os.Stat, so a file that exists but is not executable does not shadow a
// working binary of the same name on PATH. If the home directory cannot be
// determined, only PATH is searched.
func findBinary(name string) string {
	if home, err := os.UserHomeDir(); err == nil {
		candidate := filepath.Join(home, ".local", "bin", name)
		// candidate contains path separators, so LookPath skips the PATH
		// search and only checks that this file is executable.
		if path, err := exec.LookPath(candidate); err == nil {
			return path
		}
	}
	if path, err := exec.LookPath(name); err == nil {
		return path
	}
	return ""
}
// probeSucceeds runs the probe with a short timeout and reports whether it
// exited 0. Used to confirm the GPU runtime is actually usable before we
// commit to its whisper-cli build.
//
// argv[0] is the probe program and the remaining elements are its arguments.
// An empty argv, a program that is not on PATH, a non-zero exit and a probe
// that exceeds the 5-second timeout all report false.
func probeSucceeds(argv []string) bool {
	// Nothing to run: treat as "not usable" rather than indexing argv[0].
	if len(argv) == 0 {
		return false
	}
	if _, err := exec.LookPath(argv[0]); err != nil {
		return false
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
	return cmd.Run() == nil
}
// whisperJSONFile mirrors the structure whisper.cpp writes with -oj. Only the
// fields read by parseWhisperJSON are declared; any other keys in the file are
// ignored during decoding.
type whisperJSONFile struct {
// Transcription holds one entry per transcribed segment.
Transcription []struct {
// Offsets are the segment boundaries; parseWhisperJSON divides them by
// 1000 to obtain seconds, i.e. it treats them as milliseconds.
Offsets struct {
From int64 `json:"from"`
To int64 `json:"to"`
} `json:"offsets"`
// Text is the segment's transcribed text, copied through unmodified.
Text string `json:"text"`
} `json:"transcription"`
}
// parseWhisperJSON decodes the JSON document whisper.cpp writes with -oj and
// converts every transcription entry into a Segment. The integer offsets are
// divided by 1000 to produce Start and End in seconds; Text is copied as-is.
//
// It returns an error when data is not valid JSON for whisperJSONFile or when
// the document contains no transcription entries.
func parseWhisperJSON(data []byte) ([]Segment, error) {
	const msPerSecond = 1000.0

	var doc whisperJSONFile
	if err := json.Unmarshal(data, &doc); err != nil {
		return nil, fmt.Errorf("parsing whisper JSON: %w", err)
	}

	entries := doc.Transcription
	if len(entries) == 0 {
		return nil, fmt.Errorf("whisper produced no transcription segments")
	}

	segments := make([]Segment, len(entries))
	for i := range entries {
		entry := &entries[i]
		segments[i] = Segment{
			Start: float64(entry.Offsets.From) / msPerSecond,
			End:   float64(entry.Offsets.To) / msPerSecond,
			Text:  entry.Text,
		}
	}
	return segments, nil
}