214 lines
6.2 KiB
Go
214 lines
6.2 KiB
Go
package transcribe
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// WhisperCPP shells out to a whisper.cpp CLI binary (whisper-cli, whisper-cpp,
// or legacy `main`) and reads the JSON file it writes with `-oj`. The binary
// must produce a .json file at the requested output basename (`-of`), which
// is the input WAV path with its extension removed.
type WhisperCPP struct {
	// Bin is the whisper.cpp binary name or path. When empty, a binary is
	// auto-detected: accelerated GPU builds are tried first, then the CPU
	// builds found on PATH.
	Bin string
	// Model is the path to a ggml whisper model (.bin). It is required.
	Model string
	// Language to force (passed as -l); empty means auto-detect.
	Language string
	// Threads to use (passed as -t when positive); 0 lets whisper.cpp pick.
	Threads int
	// ExtraArgs are appended to the command verbatim, after all other
	// arguments.
	ExtraArgs []string
	// Verbose enables per-step diagnostic logging to stderr while a binary is
	// being auto-detected (which backend binary is missing, which probe
	// failed). The backend chosen by auto-detection is logged on a single
	// stderr line regardless of this flag; nothing is logged when Bin is set
	// explicitly.
	Verbose bool
}
|
|
|
|
// Name reports the fixed identifier of this transcription backend.
func (w *WhisperCPP) Name() string {
	const backendName = "whisper.cpp"
	return backendName
}
|
|
|
|
func (w *WhisperCPP) Transcribe(ctx context.Context, wavPath string) (string, error) {
|
|
segs, err := w.TranscribeSegments(ctx, wavPath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return PlainText(segs), nil
|
|
}
|
|
|
|
// TranscribeSegments runs whisper.cpp with JSON output and returns the
|
|
// per-segment timestamps (in seconds) and text.
|
|
func (w *WhisperCPP) TranscribeSegments(ctx context.Context, wavPath string) ([]Segment, error) {
|
|
bin, err := w.resolveBin()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if w.Model == "" {
|
|
return nil, fmt.Errorf("whisper.cpp model path is required (--whisper-model)")
|
|
}
|
|
if _, err := os.Stat(w.Model); err != nil {
|
|
return nil, fmt.Errorf("whisper model not readable at %s: %w", w.Model, err)
|
|
}
|
|
|
|
dir := filepath.Dir(wavPath)
|
|
base := strings.TrimSuffix(filepath.Base(wavPath), filepath.Ext(wavPath))
|
|
outBase := filepath.Join(dir, base)
|
|
jsonPath := outBase + ".json"
|
|
_ = os.Remove(jsonPath)
|
|
|
|
args := []string{
|
|
"-m", w.Model,
|
|
"-f", wavPath,
|
|
"-oj",
|
|
"-of", outBase,
|
|
"--no-prints",
|
|
}
|
|
if w.Language != "" {
|
|
args = append(args, "-l", w.Language)
|
|
}
|
|
if w.Threads > 0 {
|
|
args = append(args, "-t", fmt.Sprintf("%d", w.Threads))
|
|
}
|
|
args = append(args, w.ExtraArgs...)
|
|
|
|
cmd := exec.CommandContext(ctx, bin, args...)
|
|
cmd.Stdout = os.Stderr
|
|
cmd.Stderr = os.Stderr
|
|
if err := cmd.Run(); err != nil {
|
|
return nil, fmt.Errorf("%s: %w", bin, err)
|
|
}
|
|
|
|
data, err := os.ReadFile(jsonPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading whisper json %s: %w", jsonPath, err)
|
|
}
|
|
return parseWhisperJSON(data)
|
|
}
|
|
|
|
// gpuBackend is one accelerated whisper.cpp build that can be selected at
// runtime. Its binary is conventionally installed at ~/.local/bin/<bin> (or
// anywhere on PATH). The probe is a quick command expected to exit 0 only
// when the matching GPU runtime is actually usable on this machine.
type gpuBackend struct {
	// name is the backend label used in log messages.
	name string
	// bin is the file name of the whisper.cpp executable for this backend.
	bin string
	// probe is the argv of the availability check; probe[0] is the program.
	probe []string
}
|
|
|
|
var gpuBackends = []gpuBackend{
|
|
{"CUDA", "whisper-cli-cuda", []string{"nvidia-smi", "-L"}},
|
|
{"ROCm", "whisper-cli-rocm", []string{"rocminfo"}},
|
|
{"Vulkan", "whisper-cli-vulkan", []string{"vulkaninfo", "--summary"}},
|
|
}
|
|
|
|
func (w *WhisperCPP) resolveBin() (string, error) {
|
|
if w.Bin != "" {
|
|
if _, err := exec.LookPath(w.Bin); err == nil {
|
|
return w.Bin, nil
|
|
}
|
|
if _, err := os.Stat(w.Bin); err == nil {
|
|
return w.Bin, nil
|
|
}
|
|
return "", fmt.Errorf("whisper.cpp binary %q not found on PATH", w.Bin)
|
|
}
|
|
|
|
// Metal is always usable on macOS — no separate probe needed; if the
|
|
// binary exists we trust it.
|
|
if runtime.GOOS == "darwin" {
|
|
if path := findBinary("whisper-cli-metal"); path != "" {
|
|
fmt.Fprintf(os.Stderr, "whisper: using Metal backend (%s)\n", path)
|
|
return path, nil
|
|
}
|
|
}
|
|
|
|
for _, b := range gpuBackends {
|
|
path := findBinary(b.bin)
|
|
if path == "" {
|
|
if w.Verbose {
|
|
fmt.Fprintf(os.Stderr, "whisper: no %s binary (%s) installed; skipping\n", b.name, b.bin)
|
|
}
|
|
continue
|
|
}
|
|
if !probeSucceeds(b.probe) {
|
|
if w.Verbose {
|
|
fmt.Fprintf(os.Stderr, "whisper: %s binary present at %s but %s probe failed; trying next\n", b.name, path, b.probe[0])
|
|
}
|
|
continue
|
|
}
|
|
fmt.Fprintf(os.Stderr, "whisper: using %s backend (%s)\n", b.name, path)
|
|
return path, nil
|
|
}
|
|
|
|
for _, alt := range []string{"whisper-cli", "whisper-cpp", "main"} {
|
|
if path, e := exec.LookPath(alt); e == nil {
|
|
fmt.Fprintf(os.Stderr, "whisper: using CPU backend (%s)\n", path)
|
|
return path, nil
|
|
}
|
|
}
|
|
return "", fmt.Errorf("no whisper.cpp binary found (tried GPU builds whisper-cli-{cuda,rocm,vulkan} in ~/.local/bin and PATH, then CPU whisper-cli/whisper-cpp/main on PATH); pass --whisper-bin")
|
|
}
|
|
|
|
// findBinary looks for an executable first in ~/.local/bin (the convention
// for hand-built backends), then on PATH. Returns "" if neither has it.
//
// The ~/.local/bin candidate is checked with exec.LookPath rather than a
// plain stat, so a file that exists there but is not executable is skipped
// and the PATH lookup still gets a chance.
func findBinary(name string) string {
	if home, err := os.UserHomeDir(); err == nil {
		candidate := filepath.Join(home, ".local", "bin", name)
		// candidate contains a path separator, so LookPath tests that exact
		// file and does not consult PATH.
		if path, err := exec.LookPath(candidate); err == nil {
			return path
		}
	}
	if path, err := exec.LookPath(name); err == nil {
		return path
	}
	return ""
}
|
|
|
|
// probeSucceeds runs the probe with a short timeout and reports whether it
// exited 0. Used to confirm the GPU runtime is actually usable before we
// commit to its whisper-cli build.
//
// argv[0] is the program and the remaining elements are its arguments. An
// empty argv, a program that cannot be found, a non-zero exit and a timeout
// all report false.
func probeSucceeds(argv []string) bool {
	const probeTimeout = 5 * time.Second

	// Guard against an empty probe definition instead of panicking on argv[0].
	if len(argv) == 0 {
		return false
	}
	if _, err := exec.LookPath(argv[0]); err != nil {
		return false
	}
	ctx, cancel := context.WithTimeout(context.Background(), probeTimeout)
	defer cancel()
	cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
	return cmd.Run() == nil
}
|
|
|
|
// whisperJSONFile is the subset of whisper.cpp's -oj output document that is
// decoded here: the list of transcription entries, each carrying its offsets
// and its text. Other keys in the document are ignored by the decoder.
type whisperJSONFile struct {
	// Transcription holds one element per entry of the "transcription" array.
	Transcription []struct {
		// Offsets are the entry's "from"/"to" bounds as integers.
		Offsets struct {
			From int64 `json:"from"`
			To   int64 `json:"to"`
		} `json:"offsets"`
		// Text is the entry's "text" value, stored exactly as decoded.
		Text string `json:"text"`
	} `json:"transcription"`
}
|
|
|
|
func parseWhisperJSON(data []byte) ([]Segment, error) {
|
|
var f whisperJSONFile
|
|
if err := json.Unmarshal(data, &f); err != nil {
|
|
return nil, fmt.Errorf("parsing whisper JSON: %w", err)
|
|
}
|
|
if len(f.Transcription) == 0 {
|
|
return nil, fmt.Errorf("whisper produced no transcription segments")
|
|
}
|
|
out := make([]Segment, 0, len(f.Transcription))
|
|
for _, s := range f.Transcription {
|
|
out = append(out, Segment{
|
|
Start: float64(s.Offsets.From) / 1000.0,
|
|
End: float64(s.Offsets.To) / 1000.0,
|
|
Text: s.Text,
|
|
})
|
|
}
|
|
return out, nil
|
|
}
|