Initial push to gitea

This commit is contained in:
2026-05-10 13:37:17 -06:00
commit 54629aecad
20 changed files with 2381 additions and 0 deletions

42
internal/audio/audio.go Normal file
View File

@@ -0,0 +1,42 @@
// Package audio normalizes arbitrary audio/video inputs into a whisper.cpp-friendly
// 16 kHz mono PCM WAV file using ffmpeg.
package audio
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
)
// ExtractWAV transcodes input (any audio or video file ffmpeg can read) into
// a 16 kHz, mono, signed 16-bit little-endian PCM WAV written to outPath.
//
// The ffmpeg binary must be discoverable on PATH and input must exist; the
// directory that will hold outPath is created if needed. ffmpeg's own output
// is forwarded to this process's stderr. An existing outPath is overwritten
// (-y). Cancelling ctx kills the ffmpeg process.
func ExtractWAV(ctx context.Context, input, outPath string) error {
	_, lookErr := exec.LookPath("ffmpeg")
	if lookErr != nil {
		return fmt.Errorf("ffmpeg not found on PATH: %w", lookErr)
	}
	if _, statErr := os.Stat(input); statErr != nil {
		return fmt.Errorf("input not readable: %w", statErr)
	}
	if mkErr := os.MkdirAll(filepath.Dir(outPath), 0o755); mkErr != nil {
		return mkErr
	}

	ffArgs := []string{
		"-y",
		"-loglevel", "error",
		"-i", input,
		"-vn",      // drop any video stream
		"-ac", "1", // mono
		"-ar", "16000", // 16 kHz sample rate
		"-c:a", "pcm_s16le",
		outPath,
	}
	ff := exec.CommandContext(ctx, "ffmpeg", ffArgs...)
	// Keep our own stdout clean: everything ffmpeg prints goes to stderr.
	ff.Stdout, ff.Stderr = os.Stderr, os.Stderr

	runErr := ff.Run()
	if runErr == nil {
		return nil
	}
	return fmt.Errorf("ffmpeg: %w", runErr)
}

204
internal/clip/clip.go Normal file
View File

@@ -0,0 +1,204 @@
// Package clip selects the best 60-90s window from a timestamped transcript
// (using a Summarizer to do the picking) and runs ffmpeg to cut that window
// out of the original media.
package clip
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"publish/internal/summarize"
"publish/internal/transcribe"
)
// Selection is the clip window chosen by the LLM, decoded from the JSON object
// in its reply, together with the descriptive metadata returned alongside it.
type Selection struct {
	// StartSeconds and EndSeconds bound the chosen window, in seconds.
	StartSeconds float64 `json:"start_seconds"`
	EndSeconds   float64 `json:"end_seconds"`

	// Title, Hook, Quote and Reasoning are free-form text supplied by the
	// model; this package passes them through without interpreting them.
	Title     string `json:"title"`
	Hook      string `json:"hook"`
	Quote     string `json:"quote"`
	Reasoning string `json:"reasoning"`
}

// Duration reports how long the selected window is, in seconds
// (EndSeconds minus StartSeconds).
func (s Selection) Duration() float64 {
	length := s.EndSeconds - s.StartSeconds
	return length
}
// Pick asks sum to choose the best clip window from segs.
//
// promptTemplate is used as the system prompt after every {{MIN_SECONDS}} and
// {{MAX_SECONDS}} placeholder has been replaced with minSec and maxSec; the
// user message is the timestamped rendering of segs. The first JSON object
// found in the model's reply is decoded into a Selection and checked against
// the transcript bounds and the requested duration range. The window is only
// validated, never adjusted.
//
// The second return value is the model's raw reply (empty if the model was
// never reached). When validation fails, the parsed Selection is still
// returned alongside the error so callers can inspect what the model chose.
func Pick(ctx context.Context, sum summarize.Summarizer, promptTemplate string, segs []transcribe.Segment, minSec, maxSec float64) (Selection, string, error) {
	var none Selection
	if len(segs) == 0 {
		return none, "", fmt.Errorf("no transcript segments to choose from")
	}

	placeholders := strings.NewReplacer(
		"{{MIN_SECONDS}}", fmt.Sprintf("%g", minSec),
		"{{MAX_SECONDS}}", fmt.Sprintf("%g", maxSec),
	)
	systemPrompt := placeholders.Replace(promptTemplate)
	transcript := transcribe.FormatForLLM(segs)

	raw, err := sum.Summarize(ctx, systemPrompt, transcript)
	if err != nil {
		return none, "", err
	}

	objText, err := extractJSONObject(raw)
	if err != nil {
		return none, raw, fmt.Errorf("could not find JSON object in model output: %w", err)
	}

	var picked Selection
	if err := json.Unmarshal([]byte(objText), &picked); err != nil {
		return none, raw, fmt.Errorf("parsing selection JSON: %w\n--- raw ---\n%s", err, objText)
	}

	// A failed validation still hands back the parsed selection.
	err = validate(&picked, segs, minSec, maxSec)
	return picked, raw, err
}
// validate checks a model-chosen window against the transcript and the
// requested duration range. It reports an error when the window is empty or
// inverted, starts before 0, ends more than one second past the end of the
// last segment, or when its duration falls outside [minSec-2, maxSec+2].
// segs must be non-empty. sel is only read, never modified.
func validate(sel *Selection, segs []transcribe.Segment, minSec, maxSec float64) error {
	start, end := sel.StartSeconds, sel.EndSeconds
	if end <= start {
		return fmt.Errorf("invalid window: end (%g) <= start (%g)", end, start)
	}

	transcriptEnd := segs[len(segs)-1].End
	if start < 0 || end > transcriptEnd+1.0 {
		return fmt.Errorf("window [%g, %g] is outside transcript bounds [0, %g]",
			start, end, transcriptEnd)
	}

	// Two seconds of slop on either side of the requested range is tolerated.
	length := sel.Duration()
	if length < minSec-2 || length > maxSec+2 {
		return fmt.Errorf("window duration %.1fs is outside requested bounds [%g, %g]",
			length, minSec, maxSec)
	}
	return nil
}
// extractJSONObject pulls a JSON object out of s, tolerating prose or code
// fences around it. Useful when the model wraps its answer in commentary
// despite being told not to.
//
// It walks the top-level balanced {...} spans of s in order (braces inside
// JSON strings are ignored) and returns the first one that is valid JSON, so
// prose such as "fill in {placeholder}" ahead of the real answer is skipped.
// If no balanced span is valid JSON, the first balanced span is returned
// unchanged so the caller's decoder can report a precise syntax error.
//
// An error is returned when s contains no '{' at all, or when no balanced
// span exists (for example a truncated reply).
func extractJSONObject(s string) (string, error) {
	if !strings.Contains(s, "{") {
		return "", fmt.Errorf("no '{' in response")
	}

	firstBalanced := ""
	for pos := 0; pos < len(s); {
		rel := strings.IndexByte(s[pos:], '{')
		if rel < 0 {
			break
		}
		start := pos + rel
		end := balancedObjectEnd(s, start)
		if end < 0 {
			// Everything from here on sits inside an unterminated object;
			// returning one of its inner objects would be misleading.
			break
		}
		candidate := s[start:end]
		if json.Valid([]byte(candidate)) {
			return candidate, nil
		}
		if firstBalanced == "" {
			firstBalanced = candidate
		}
		// Resume after this span so only top-level siblings are considered.
		pos = end
	}

	if firstBalanced != "" {
		return firstBalanced, nil
	}
	return "", fmt.Errorf("unbalanced braces")
}

// balancedObjectEnd returns the index just past the '}' that closes the '{'
// at s[start], skipping braces that appear inside double-quoted strings
// (honouring backslash escapes). It returns -1 if the brace is never closed.
func balancedObjectEnd(s string, start int) int {
	depth := 0
	inStr := false
	esc := false
	for i := start; i < len(s); i++ {
		c := s[i]
		if inStr {
			switch {
			case esc:
				esc = false
			case c == '\\':
				esc = true
			case c == '"':
				inStr = false
			}
			continue
		}
		switch c {
		case '"':
			inStr = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i + 1
			}
		}
	}
	return -1
}
// portraitFilter is the ffmpeg -vf filter chain used when re-encoding video
// clips. It crops the source to a 9:16 sub-rectangle (no distortion, just
// cropping), scales the result to 1080x1920, and forces square pixels with
// setsar=1.
//
// The crop width is min(iw, ih*9/16) and the crop height is min(ih, iw*16/9),
// i.e. the largest 9:16 box that fits inside the source: 16:9 sources lose the
// left/right edges, 9:16 sources are unchanged, and 4:3 / 1:1 sources crop the
// sides. No x/y offset is given, so the crop position is ffmpeg's default.
// The backslash-escaped commas keep the min() arguments from being read as
// filter separators.
const portraitFilter = `crop=min(iw\,ih*9/16):min(ih\,iw*16/9),scale=1080:1920,setsar=1`
// MaxClipBytes is the hard size ceiling, in bytes, passed to ffmpeg's -fs
// flag when cutting a clip. Realistic 60-90s 1080x1920 H.264 clips at CRF 23
// land around 30-100 MB, so this is a safety cap rather than a target.
const MaxClipBytes = 1 << 30 // 1 GiB
// Extract runs ffmpeg to cut the window described by sel out of input and
// write it to outPath (overwriting any existing file). The directory holding
// outPath is created if needed, and ffmpeg's output goes to stderr.
//
// How the clip is produced depends on reencode and on input's extension:
//   - reencode false: streams are copied as-is (fast, keyframe-aligned, source
//     aspect ratio preserved).
//   - reencode true, video extension: re-encoded to H.264/AAC as a 1080x1920
//     portrait crop (see portraitFilter).
//   - reencode true, any other extension: video is dropped and the audio is
//     encoded as 128k AAC.
//
// In every mode the output is capped at MaxClipBytes via -fs. Cancelling ctx
// kills the ffmpeg process.
func Extract(ctx context.Context, input string, sel Selection, outPath string, reencode bool) error {
	if _, lookErr := exec.LookPath("ffmpeg"); lookErr != nil {
		return fmt.Errorf("ffmpeg not on PATH: %w", lookErr)
	}
	if mkErr := os.MkdirAll(filepath.Dir(outPath), 0o755); mkErr != nil {
		return mkErr
	}

	// -ss before -i seeks on the input; -t limits the output length.
	ffArgs := []string{
		"-y",
		"-loglevel", "error",
		"-ss", fmt.Sprintf("%.3f", sel.StartSeconds),
		"-i", input,
		"-t", fmt.Sprintf("%.3f", sel.EndSeconds-sel.StartSeconds),
	}

	switch {
	case !reencode:
		ffArgs = append(ffArgs, "-c", "copy")
	case hasVideoExt(input):
		ffArgs = append(ffArgs,
			"-vf", portraitFilter,
			"-c:v", "libx264",
			"-preset", "fast",
			"-crf", "23",
			"-c:a", "aac",
			"-b:a", "128k",
			"-movflags", "+faststart",
		)
	default:
		ffArgs = append(ffArgs,
			"-vn",
			"-c:a", "aac",
			"-b:a", "128k",
		)
	}
	ffArgs = append(ffArgs, "-fs", fmt.Sprintf("%d", MaxClipBytes), outPath)

	ff := exec.CommandContext(ctx, "ffmpeg", ffArgs...)
	ff.Stdout, ff.Stderr = os.Stderr, os.Stderr
	runErr := ff.Run()
	if runErr == nil {
		return nil
	}
	return fmt.Errorf("ffmpeg cut: %w", runErr)
}
// hasVideoExt reports whether p ends in one of the video container
// extensions this package recognises. The comparison is case-insensitive and
// looks only at the file name, never at the file's contents.
func hasVideoExt(p string) bool {
	ext := strings.ToLower(filepath.Ext(p))
	for _, known := range []string{".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v", ".flv", ".ts"} {
		if ext == known {
			return true
		}
	}
	return false
}
// DefaultOutputPath derives an output file name from input. Inputs with a
// recognised video extension map to <input-without-ext>.clip<ext>; everything
// else maps to <input-without-ext>.clip.m4a.
func DefaultOutputPath(input string) string {
	ext := filepath.Ext(input)
	stem := strings.TrimSuffix(input, ext)
	if !hasVideoExt(input) {
		return stem + ".clip.m4a"
	}
	return stem + ".clip" + ext
}

View File

@@ -0,0 +1,37 @@
package clip
import "testing"
// TestExtractJSONObject checks that extractJSONObject recovers the JSON object
// from bare JSON, JSON surrounded by prose or a code fence, nested objects,
// and objects whose string values contain braces.
func TestExtractJSONObject(t *testing.T) {
	type extractCase struct {
		label    string
		input    string
		expected string
	}
	table := []extractCase{
		{label: "raw json", input: `{"a":1}`, expected: `{"a":1}`},
		{label: "with prose", input: "Sure, here you go:\n{\"a\":1}\nThanks", expected: `{"a":1}`},
		{label: "with fence", input: "```json\n{\"a\":1}\n```", expected: `{"a":1}`},
		{label: "nested", input: `prelude {"a":{"b":2},"c":3} trailing`, expected: `{"a":{"b":2},"c":3}`},
		{label: "brace in string", input: `{"text":"hello {world}"}`, expected: `{"text":"hello {world}"}`},
	}

	for i := range table {
		tc := table[i]
		t.Run(tc.label, func(t *testing.T) {
			out, err := extractJSONObject(tc.input)
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			if out != tc.expected {
				t.Errorf("got %q want %q", out, tc.expected)
			}
		})
	}
}
// TestExtractJSONObjectMissing checks that extractJSONObject reports an error
// both when the input has no object at all and when the object is never
// closed.
func TestExtractJSONObjectMissing(t *testing.T) {
	_, missingErr := extractJSONObject("no json here")
	if missingErr == nil {
		t.Error("expected error for missing JSON")
	}

	_, unbalancedErr := extractJSONObject(`{"unterminated":`)
	if unbalancedErr == nil {
		t.Error("expected error for unbalanced braces")
	}
}

View File

@@ -0,0 +1,30 @@
package output
import (
"fmt"
"os/exec"
"strings"
)
// CopyToClipboard pipes data to the stdin of a platform clipboard tool. The
// candidates are tried in order: wl-copy (Wayland), xclip (X11), pbcopy
// (macOS). Tools that are not on PATH are skipped.
//
// If an installed tool fails (for example wl-copy is installed but there is
// no Wayland session), the next candidate is tried instead of giving up, so a
// machine with several tools installed still works under whichever display
// server is actually running.
//
// It returns the name of the tool that succeeded. If every installed tool
// failed, the error from the first failure is returned (wrapped with the tool
// name); if no tool is installed at all, the error says so.
func CopyToClipboard(data string) (string, error) {
	candidates := [][]string{
		{"wl-copy"},
		{"xclip", "-selection", "clipboard"},
		{"pbcopy"},
	}

	var firstErr error
	for _, c := range candidates {
		if _, err := exec.LookPath(c[0]); err != nil {
			continue
		}
		cmd := exec.Command(c[0], c[1:]...)
		cmd.Stdin = strings.NewReader(data)
		if err := cmd.Run(); err != nil {
			// Remember the first failure but keep going: a later tool may
			// match the session we are actually running in.
			if firstErr == nil {
				firstErr = fmt.Errorf("%s: %w", c[0], err)
			}
			continue
		}
		return c[0], nil
	}

	if firstErr != nil {
		return "", firstErr
	}
	return "", fmt.Errorf("no clipboard tool found (tried wl-copy, xclip, pbcopy)")
}

154
internal/output/spotify.go Normal file
View File

@@ -0,0 +1,154 @@
// Package output renders summaries to user-visible formats. Markdown is
// passed through; Spotify HTML uses the small tag subset that Spotify for
// Podcasters' show-notes editor accepts (b, i, a, ul/ol/li, paragraphs).
package output
import (
"regexp"
"strings"
)
// Inline markdown patterns, compiled once at package scope and applied by
// inline. None of them match across a newline.
var (
// reBoldStar matches **text** where text contains no '*'.
reBoldStar = regexp.MustCompile(`\*\*([^*\n]+)\*\*`)
// reBoldUnder matches __text__ where text contains no '_'.
reBoldUnder = regexp.MustCompile(`__([^_\n]+)__`)
// reItalicStar matches *text* where text contains no '*'.
reItalicStar = regexp.MustCompile(`\*([^*\n]+)\*`)
// reItalicUnder matches _text_ only when it is preceded by the start of the
// string, whitespace or '(' and followed by the end of the string, whitespace
// or one of ).,!?;: — so underscores inside a word (snake_case) do not match.
// Groups 1 and 3 capture those surrounding delimiters; group 2 is the text.
reItalicUnder = regexp.MustCompile(`(^|[\s(])_([^_\n]+)_($|[\s).,!?;:])`)
// reLink matches [label](url); the url may not contain ')' or whitespace.
reLink = regexp.MustCompile(`\[([^\]]+)\]\(([^)\s]+)\)`)
// reInlineCode matches a backtick-delimited code span.
reInlineCode = regexp.MustCompile("`([^`\n]+)`")
)
// MarkdownToSpotifyHTML renders a markdown summary as the restricted HTML
// subset used for Spotify for Podcasters show notes. The input is processed
// line by line:
//
//   - blank lines and horizontal rules (---, ***, ___) end the current
//     paragraph and list and emit nothing themselves;
//   - headings (# through ######) become bold paragraphs;
//   - "> " blockquotes become italic paragraphs;
//   - "- ", "* " and "+ " items become <ul> entries, "N. " items <ol> entries;
//   - any other line is appended to the current paragraph, with consecutive
//     lines joined by a single space.
//
// Inline markup inside each piece of text is handled by inline. The result has
// no trailing newline.
func MarkdownToSpotifyHTML(md string) string {
	var (
		sb       strings.Builder
		openList string   // "ul" or "ol" while a list is open, "" otherwise
		pending  []string // lines of the paragraph being accumulated
	)

	closeList := func() {
		if openList == "" {
			return
		}
		sb.WriteString("</" + openList + ">\n")
		openList = ""
	}
	// ensureList switches to a list of the given kind, closing a list of the
	// other kind first.
	ensureList := func(kind string) {
		if openList == kind {
			return
		}
		closeList()
		sb.WriteString("<" + kind + ">\n")
		openList = kind
	}
	closePara := func() {
		if len(pending) == 0 {
			return
		}
		sb.WriteString("<p>" + inline(strings.Join(pending, " ")) + "</p>\n")
		pending = pending[:0]
	}
	endBlocks := func() {
		closePara()
		closeList()
	}

	normalized := strings.ReplaceAll(md, "\r\n", "\n")
	for _, raw := range strings.Split(normalized, "\n") {
		text := strings.TrimSpace(raw)

		if text == "" || text == "---" || text == "***" || text == "___" {
			// Blank line or horizontal rule: just a block separator.
			endBlocks()
		} else if heading := headingText(text); heading != "" {
			endBlocks()
			sb.WriteString("<p><b>" + inline(heading) + "</b></p>\n")
		} else if strings.HasPrefix(text, "> ") {
			endBlocks()
			sb.WriteString("<p><i>" + inline(text[len("> "):]) + "</i></p>\n")
		} else if strings.HasPrefix(text, "- ") || strings.HasPrefix(text, "* ") || strings.HasPrefix(text, "+ ") {
			closePara()
			ensureList("ul")
			sb.WriteString("  <li>" + inline(text[2:]) + "</li>\n")
		} else if item, isOrdered := orderedItem(text); isOrdered {
			closePara()
			ensureList("ol")
			sb.WriteString("  <li>" + inline(item) + "</li>\n")
		} else {
			// Plain text continues (or starts) a paragraph and ends any list.
			closeList()
			pending = append(pending, text)
		}
	}
	endBlocks()

	return strings.TrimRight(sb.String(), "\n")
}
// headingText returns the title of an ATX-style markdown heading: one to six
// leading '#' characters, a space, then the title. It returns "" when s is not
// such a heading or when the title is empty after trimming whitespace.
func headingText(s string) string {
	rest := strings.TrimLeft(s, "#")
	level := len(s) - len(rest)
	if level < 1 || level > 6 {
		return ""
	}
	// The hashes must be followed by a space, so "#hashtag" is not a heading.
	if rest == "" || rest[0] != ' ' {
		return ""
	}
	return strings.TrimSpace(rest[1:])
}
// orderedItem recognises an ordered-list line of the form "<digits>. text".
// On a match it returns the trimmed text and true; otherwise "" and false.
func orderedItem(s string) (string, bool) {
	// The marker ends at the first ". "; everything before it must be one or
	// more ASCII digits.
	dot := strings.Index(s, ". ")
	if dot <= 0 {
		return "", false
	}
	for _, b := range []byte(s[:dot]) {
		if b < '0' || b > '9' {
			return "", false
		}
	}
	return strings.TrimSpace(s[dot+2:]), true
}
func inline(s string) string {
s = escapeHTML(s)
s = reInlineCode.ReplaceAllString(s, "$1")
s = reBoldStar.ReplaceAllString(s, "<b>$1</b>")
s = reBoldUnder.ReplaceAllString(s, "<b>$1</b>")
s = reItalicStar.ReplaceAllString(s, "<i>$1</i>")
s = reItalicUnder.ReplaceAllString(s, "$1<i>$2</i>$3")
s = reLink.ReplaceAllString(s, `<a href="$2">$1</a>`)
return s
}
// escapeHTML replaces the characters that are significant in HTML text and in
// double-quoted attribute values (&, <, >, ") with their entity references.
//
// The double quote matters because inline places escaped text inside
// href="..." when it rewrites markdown links: without escaping it, a URL
// containing a quote could close the attribute and inject further attributes.
func escapeHTML(s string) string {
	r := strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		`"`, "&quot;",
	)
	return r.Replace(s)
}

View File

@@ -0,0 +1,67 @@
package output
import (
"strings"
"testing"
)
// TestMarkdownToSpotifyHTML renders a representative summary (headings, bold
// and italic text, a link, both list kinds and a blockquote) and checks that
// the expected HTML fragments are present while unsupported tags and leftover
// markdown markers are not.
func TestMarkdownToSpotifyHTML(t *testing.T) {
in := `# Sermon Title
**Speaker:** Pastor Bob
**Scripture:** John 3:16
## Overview
This was a *short* message about hope. See [the site](https://example.com).
## Key Points
- First point
- Second point with **bold** text
- Third one
1. Step one
2. Step two
> A pithy quote.
`
got := MarkdownToSpotifyHTML(in)
// Fragments that must appear somewhere in the output.
mustContain := []string{
"<p><b>Sermon Title</b></p>",
"<b>Speaker:</b>",
"<p><b>Overview</b></p>",
"<i>short</i>",
`<a href="https://example.com">the site</a>`,
"<ul>",
"<li>First point</li>",
"<li>Second point with <b>bold</b> text</li>",
"</ul>",
"<ol>",
"<li>Step one</li>",
"</ol>",
"<p><i>A pithy quote.</i></p>",
}
for _, s := range mustContain {
if !strings.Contains(got, s) {
t.Errorf("expected output to contain %q\n--- got ---\n%s", s, got)
}
}
// Heading and blockquote tags are outside the supported subset, and raw
// markdown markers must not leak through.
mustNotContain := []string{"<h1>", "<h2>", "<blockquote>", "**", "##"}
for _, s := range mustNotContain {
if strings.Contains(got, s) {
t.Errorf("did not expect output to contain %q\n--- got ---\n%s", s, got)
}
}
}
// TestEscapesHTML checks that raw HTML in the markdown input is escaped
// instead of being passed through: no literal <script> tag survives and '&'
// is emitted as &amp;.
func TestEscapesHTML(t *testing.T) {
	rendered := MarkdownToSpotifyHTML("A <script>tag</script> & ampersand")

	leakedTag := strings.Contains(rendered, "<script>")
	if leakedTag {
		t.Errorf("unescaped <script>: %s", rendered)
	}

	hasEntity := strings.Contains(rendered, "&amp;")
	if !hasEntity {
		t.Errorf("expected &amp; in: %s", rendered)
	}
}

View File

@@ -0,0 +1,123 @@
package summarize
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"time"
)
// Anthropic is a summarizer backend that calls the Claude Messages API
// directly over net/http, avoiding an SDK dependency. Every field is
// optional; Summarize fills in a default for any that is left at its zero
// value.
type Anthropic struct {
	// APIKey authenticates requests. When empty, the ANTHROPIC_API_KEY
	// environment variable is used instead.
	APIKey string
	// Model is the model identifier sent with each request.
	Model string
	// MaxTokens is the max_tokens value sent with each request.
	MaxTokens int
	// BaseURL optionally overrides the API origin; it defaults to
	// https://api.anthropic.com.
	BaseURL string
	// Client is the HTTP client used for requests; nil selects a default
	// client with a timeout.
	Client *http.Client
}

// Name returns the fixed identifier of this backend, "anthropic-api".
func (a *Anthropic) Name() string {
	return "anthropic-api"
}
// anthroMessage is one entry of the "messages" array in a request body.
type anthroMessage struct {
Role string `json:"role"`
Content string `json:"content"`
}
// anthroRequest is the JSON body POSTed to /v1/messages. System is omitted
// from the encoded JSON when empty.
type anthroRequest struct {
Model string `json:"model"`
MaxTokens int `json:"max_tokens"`
System string `json:"system,omitempty"`
Messages []anthroMessage `json:"messages"`
}
// anthroContentBlock is one element of a response's "content" array. Only
// blocks whose Type is "text" are used by Summarize.
type anthroContentBlock struct {
Type string `json:"type"`
Text string `json:"text"`
}
// anthroResponse is the subset of the response body that Summarize decodes:
// the content blocks and, when present, an error object.
type anthroResponse struct {
Content []anthroContentBlock `json:"content"`
Error *struct {
Type string `json:"type"`
Message string `json:"message"`
} `json:"error,omitempty"`
}
// Summarize sends systemPrompt as the system prompt and userContent as a
// single user message to the Messages API, and returns the concatenated text
// of every "text" content block in the reply.
//
// Unset fields fall back to defaults: the API key comes from
// ANTHROPIC_API_KEY, the model is "claude-sonnet-4-6", max_tokens is 4096,
// the base URL is https://api.anthropic.com, and the HTTP client has a
// five-minute timeout.
//
// An error is returned when no API key is available, the request cannot be
// built or sent, the response body cannot be read, the status is not 2xx (the
// body is included in the message), the body is not valid JSON, or the
// decoded body carries an error object.
func (a *Anthropic) Summarize(ctx context.Context, systemPrompt, userContent string) (string, error) {
	key := a.APIKey
	if key == "" {
		key = os.Getenv("ANTHROPIC_API_KEY")
	}
	if key == "" {
		return "", fmt.Errorf("ANTHROPIC_API_KEY is not set")
	}
	model := a.Model
	if model == "" {
		model = "claude-sonnet-4-6"
	}
	maxTokens := a.MaxTokens
	if maxTokens == 0 {
		maxTokens = 4096
	}
	baseURL := a.BaseURL
	if baseURL == "" {
		baseURL = "https://api.anthropic.com"
	}
	client := a.Client
	if client == nil {
		client = &http.Client{Timeout: 5 * time.Minute}
	}

	body := anthroRequest{
		Model:     model,
		MaxTokens: maxTokens,
		System:    systemPrompt,
		Messages: []anthroMessage{
			{Role: "user", Content: userContent},
		},
	}
	buf, err := json.Marshal(body)
	if err != nil {
		return "", err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, baseURL+"/v1/messages", bytes.NewReader(buf))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-api-key", key)
	req.Header.Set("anthropic-version", "2023-06-01")

	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// A failed read (dropped connection, timeout mid-body) must be reported
	// as such; otherwise the truncated body surfaces as a confusing JSON
	// decoding error or an incomplete API error message.
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading anthropic response (HTTP %d): %w", resp.StatusCode, err)
	}
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("anthropic API %d: %s", resp.StatusCode, string(respBody))
	}

	var out anthroResponse
	if err := json.Unmarshal(respBody, &out); err != nil {
		return "", fmt.Errorf("decoding anthropic response: %w", err)
	}
	if out.Error != nil {
		return "", fmt.Errorf("anthropic error: %s: %s", out.Error.Type, out.Error.Message)
	}

	var text bytes.Buffer
	for _, c := range out.Content {
		if c.Type == "text" {
			text.WriteString(c.Text)
		}
	}
	return text.String(), nil
}

View File

@@ -0,0 +1,49 @@
package summarize
import (
"bytes"
"context"
"fmt"
"os/exec"
"strings"
)
// ClaudeCLI is a summarizer backend that runs the `claude` command-line tool
// in print mode (-p). The user content is fed on stdin rather than as an
// argument, so very long transcripts do not run into ARG_MAX.
type ClaudeCLI struct {
	// Bin is the binary to execute; empty means "claude".
	Bin string
	// Model is forwarded as `--model <Model>`. When empty the flag is omitted
	// and the CLI uses its own default.
	Model string
	// ExtraArgs are inserted verbatim after the flags above and before the
	// final prompt argument.
	ExtraArgs []string
}

// Name returns the fixed identifier of this backend, "claude-cli".
func (c *ClaudeCLI) Name() string {
	return "claude-cli"
}
// Summarize runs the CLI as `<bin> -p [--model M] [ExtraArgs...] <systemPrompt>`
// with userContent on stdin, and returns the command's stdout with
// surrounding whitespace trimmed.
//
// It fails when the binary is not on PATH or when the command exits with an
// error; in the latter case the trimmed stderr is included in the message.
// Cancelling ctx kills the child process.
func (c *ClaudeCLI) Summarize(ctx context.Context, systemPrompt, userContent string) (string, error) {
	bin := "claude"
	if c.Bin != "" {
		bin = c.Bin
	}
	if _, lookErr := exec.LookPath(bin); lookErr != nil {
		return "", fmt.Errorf("%q not on PATH: %w", bin, lookErr)
	}

	argv := []string{"-p"}
	if c.Model != "" {
		argv = append(argv, "--model", c.Model)
	}
	argv = append(argv, c.ExtraArgs...)
	// The prompt is always the last argument.
	argv = append(argv, systemPrompt)

	var outBuf, errBuf bytes.Buffer
	child := exec.CommandContext(ctx, bin, argv...)
	child.Stdin = strings.NewReader(userContent)
	child.Stdout = &outBuf
	child.Stderr = &errBuf

	runErr := child.Run()
	if runErr != nil {
		return "", fmt.Errorf("%s: %w (stderr: %s)", bin, runErr, strings.TrimSpace(errBuf.String()))
	}
	return strings.TrimSpace(outBuf.String()), nil
}

View File

@@ -0,0 +1,13 @@
// Package summarize turns a transcript + system prompt into a markdown summary.
package summarize
import "context"
// Summarizer produces a markdown summary (or other generation) guided by
// systemPrompt and given the user-message body. The body is passed verbatim:
// callers are responsible for any framing like "Transcript:", "Producer's
// notes:", or timestamped segment formatting.
type Summarizer interface {
// Summarize returns the model's text output for the given system prompt and
// user content, or an error if generation failed.
Summarize(ctx context.Context, systemPrompt, userContent string) (string, error)
// Name returns a short identifier for the backend.
Name() string
}

View File

@@ -0,0 +1,49 @@
package transcribe
import (
"fmt"
"strings"
)
// Segment is one timestamped chunk of a transcript.
type Segment struct {
Start float64 // seconds from start of audio
End float64 // seconds from start of audio, like Start
Text string // transcribed text; helpers in this file trim surrounding whitespace when rendering
}
// PlainText flattens segs into one transcript string: each segment's text is
// trimmed, the pieces are joined with single spaces, and the result is
// trimmed once more. Timestamps are discarded.
func PlainText(segs []Segment) string {
	pieces := make([]string, len(segs))
	for i := range segs {
		pieces[i] = strings.TrimSpace(segs[i].Text)
	}
	return strings.TrimSpace(strings.Join(pieces, " "))
}
// FormatForLLM renders segs one per line, each prefixed with its start and
// end timestamps, for feeding to a model that has to pick a time window:
//
//	[mm:ss] [mm:ss] text
//
// Timestamps at or beyond one hour are written as hh:mm:ss. Segment text is
// trimmed and every line, including the last, ends with a newline.
func FormatForLLM(segs []Segment) string {
	var out strings.Builder
	for i := range segs {
		seg := segs[i]
		line := fmt.Sprintf("[%s] [%s] %s\n",
			formatTS(seg.Start), formatTS(seg.End), strings.TrimSpace(seg.Text))
		out.WriteString(line)
	}
	return out.String()
}
// formatTS renders a time offset in seconds as mm:ss, or as hh:mm:ss once it
// reaches a full hour. Fractional seconds are truncated and negative inputs
// are treated as zero.
func formatTS(seconds float64) string {
	if seconds < 0 {
		seconds = 0
	}
	whole := int(seconds)
	hours, rem := whole/3600, whole%3600
	minutes, secs := rem/60, rem%60

	if hours <= 0 {
		return fmt.Sprintf("%02d:%02d", minutes, secs)
	}
	return fmt.Sprintf("%02d:%02d:%02d", hours, minutes, secs)
}

View File

@@ -0,0 +1,10 @@
// Package transcribe converts a normalized WAV file into a plain-text transcript.
package transcribe
import "context"
// Transcriber turns a 16kHz mono WAV at wavPath into a plaintext transcript.
type Transcriber interface {
// Transcribe returns the transcript of the audio file at wavPath, or an
// error if transcription failed.
Transcribe(ctx context.Context, wavPath string) (string, error)
// Name returns a short identifier for the backend.
Name() string
}

View File

@@ -0,0 +1,213 @@
package transcribe
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"time"
)
// WhisperCPP transcribes audio by shelling out to a whisper.cpp CLI binary
// (whisper-cli, whisper-cpp, or legacy `main`). The binary is run with JSON
// output enabled (-oj) and must write a .json file next to the requested
// output basename, which is then read back and parsed.
type WhisperCPP struct {
	// Bin is the whisper.cpp binary name or path. When empty, a binary is
	// chosen automatically, preferring GPU-accelerated builds.
	Bin string
	// Model is the path to a ggml whisper model (.bin). Required.
	Model string
	// Language forces the spoken language; empty leaves the choice to
	// whisper.cpp.
	Language string
	// Threads is the thread count to request; 0 lets whisper.cpp pick.
	Threads int
	// ExtraArgs are appended to the command line verbatim.
	ExtraArgs []string
	// Verbose enables per-step diagnostic logging to stderr while picking a
	// binary (which builds were missing, which probes failed). The selected
	// backend is logged on a single stderr line regardless of this flag.
	Verbose bool
}

// Name returns the fixed identifier of this backend, "whisper.cpp".
func (w *WhisperCPP) Name() string {
	return "whisper.cpp"
}
// Transcribe transcribes the WAV file at wavPath and returns the text of all
// segments joined into a single string, without timestamps. It is a thin
// wrapper around TranscribeSegments.
func (w *WhisperCPP) Transcribe(ctx context.Context, wavPath string) (string, error) {
	segments, err := w.TranscribeSegments(ctx, wavPath)
	if err == nil {
		return PlainText(segments), nil
	}
	return "", err
}
// TranscribeSegments runs whisper.cpp on wavPath with JSON output and returns
// the per-segment timestamps (in seconds) and text.
//
// The JSON is written next to the input as <wav-without-ext>.json; a stale
// file at that path is removed first so an old result is never mistaken for a
// fresh one, and the new file is left on disk afterwards. whisper.cpp's own
// output is forwarded to stderr. It fails when no binary can be resolved, the
// model path is empty or unreadable, the command fails, or the JSON cannot be
// read or parsed.
func (w *WhisperCPP) TranscribeSegments(ctx context.Context, wavPath string) ([]Segment, error) {
	bin, err := w.resolveBin()
	if err != nil {
		return nil, err
	}

	if w.Model == "" {
		return nil, fmt.Errorf("whisper.cpp model path is required (--whisper-model)")
	}
	if _, statErr := os.Stat(w.Model); statErr != nil {
		return nil, fmt.Errorf("whisper model not readable at %s: %w", w.Model, statErr)
	}

	// -of takes the output path without extension; whisper.cpp adds ".json".
	stem := strings.TrimSuffix(filepath.Base(wavPath), filepath.Ext(wavPath))
	outBase := filepath.Join(filepath.Dir(wavPath), stem)
	jsonPath := outBase + ".json"
	_ = os.Remove(jsonPath) // best effort: the file usually does not exist yet

	argv := []string{
		"-m", w.Model,
		"-f", wavPath,
		"-oj",
		"-of", outBase,
		"--no-prints",
	}
	if w.Language != "" {
		argv = append(argv, "-l", w.Language)
	}
	if w.Threads > 0 {
		argv = append(argv, "-t", fmt.Sprintf("%d", w.Threads))
	}
	argv = append(argv, w.ExtraArgs...)

	child := exec.CommandContext(ctx, bin, argv...)
	child.Stdout, child.Stderr = os.Stderr, os.Stderr
	if runErr := child.Run(); runErr != nil {
		return nil, fmt.Errorf("%s: %w", bin, runErr)
	}

	raw, readErr := os.ReadFile(jsonPath)
	if readErr != nil {
		return nil, fmt.Errorf("reading whisper json %s: %w", jsonPath, readErr)
	}
	return parseWhisperJSON(raw)
}
// gpuBackend describes one accelerated whisper.cpp build we may pick at
// runtime. The binary is conventionally installed at ~/.local/bin/<bin> (or
// anywhere on PATH); the probe is a fast command that exits 0 only when the
// matching GPU runtime is actually usable on this machine.
type gpuBackend struct {
name string // human-readable backend name used in log lines
bin string // file name of the whisper.cpp build for this backend
probe []string // argv of the probe command; probe[0] is the executable
}
// gpuBackends lists the accelerated builds that resolveBin considers, in the
// order they are tried; the first one whose binary is found and whose probe
// succeeds is used.
var gpuBackends = []gpuBackend{
{"CUDA", "whisper-cli-cuda", []string{"nvidia-smi", "-L"}},
{"ROCm", "whisper-cli-rocm", []string{"rocminfo"}},
{"Vulkan", "whisper-cli-vulkan", []string{"vulkaninfo", "--summary"}},
}
// resolveBin decides which whisper.cpp binary to run.
//
// An explicit w.Bin wins: it is accepted if it resolves on PATH or exists as
// a file path, and is an error otherwise. With no explicit binary the search
// order is:
//
//  1. on macOS, whisper-cli-metal if present (no probe is run);
//  2. each entry of gpuBackends in order, requiring both the binary and a
//     successful probe;
//  3. the CPU builds whisper-cli, whisper-cpp and main, looked up on PATH.
//
// The chosen backend is always announced on stderr; skipped GPU backends are
// only reported when w.Verbose is set.
func (w *WhisperCPP) resolveBin() (string, error) {
	if w.Bin != "" {
		_, lookErr := exec.LookPath(w.Bin)
		if lookErr == nil {
			return w.Bin, nil
		}
		if _, statErr := os.Stat(w.Bin); statErr == nil {
			return w.Bin, nil
		}
		return "", fmt.Errorf("whisper.cpp binary %q not found on PATH", w.Bin)
	}

	// Metal is always usable on macOS — no separate probe needed; if the
	// binary exists we trust it.
	if runtime.GOOS == "darwin" {
		if metal := findBinary("whisper-cli-metal"); metal != "" {
			fmt.Fprintf(os.Stderr, "whisper: using Metal backend (%s)\n", metal)
			return metal, nil
		}
	}

	for _, backend := range gpuBackends {
		path := findBinary(backend.bin)
		switch {
		case path == "":
			if w.Verbose {
				fmt.Fprintf(os.Stderr, "whisper: no %s binary (%s) installed; skipping\n", backend.name, backend.bin)
			}
		case !probeSucceeds(backend.probe):
			if w.Verbose {
				fmt.Fprintf(os.Stderr, "whisper: %s binary present at %s but %s probe failed; trying next\n", backend.name, path, backend.probe[0])
			}
		default:
			fmt.Fprintf(os.Stderr, "whisper: using %s backend (%s)\n", backend.name, path)
			return path, nil
		}
	}

	cpuBuilds := []string{"whisper-cli", "whisper-cpp", "main"}
	for _, candidate := range cpuBuilds {
		path, lookErr := exec.LookPath(candidate)
		if lookErr != nil {
			continue
		}
		fmt.Fprintf(os.Stderr, "whisper: using CPU backend (%s)\n", path)
		return path, nil
	}

	return "", fmt.Errorf("no whisper.cpp binary found (tried GPU builds whisper-cli-{cuda,rocm,vulkan} in ~/.local/bin and PATH, then CPU whisper-cli/whisper-cpp/main on PATH); pass --whisper-bin")
}
// findBinary locates name, checking ~/.local/bin first (the convention for
// hand-built backends) and then PATH. A ~/.local/bin entry is accepted if it
// exists and is not a directory. It returns the path found, or "" if neither
// location has it.
func findBinary(name string) string {
	home, homeErr := os.UserHomeDir()
	if homeErr == nil {
		local := filepath.Join(home, ".local", "bin", name)
		if st, statErr := os.Stat(local); statErr == nil && !st.IsDir() {
			return local
		}
	}

	onPath, lookErr := exec.LookPath(name)
	if lookErr != nil {
		return ""
	}
	return onPath
}
// probeSucceeds runs the probe command argv (executable followed by its
// arguments) with a five-second timeout and reports whether it exited 0. It
// is used to confirm that a GPU runtime is actually usable before committing
// to the matching whisper-cli build.
//
// It returns false when argv is empty, when the executable is not on PATH,
// when the command exits non-zero, or when the timeout expires. The probe's
// output is discarded.
func probeSucceeds(argv []string) bool {
	// An empty probe can never succeed; guard it so argv[0] cannot panic.
	if len(argv) == 0 {
		return false
	}
	if _, err := exec.LookPath(argv[0]); err != nil {
		return false
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
	return cmd.Run() == nil
}
// whisperJSONFile mirrors the structure whisper.cpp writes with -oj. Only the
// fields this package reads are declared.
type whisperJSONFile struct {
// Transcription holds one entry per transcribed segment.
Transcription []struct {
// Offsets gives the segment bounds; parseWhisperJSON divides them by 1000
// to obtain seconds.
Offsets struct {
From int64 `json:"from"`
To int64 `json:"to"`
} `json:"offsets"`
Text string `json:"text"`
} `json:"transcription"`
}
// parseWhisperJSON decodes the JSON document whisper.cpp writes with -oj into
// segments. Each segment's from/to offsets are divided by 1000 to give
// seconds, and the text is copied unchanged. It returns an error when the
// document is not valid JSON or contains no transcription entries.
func parseWhisperJSON(data []byte) ([]Segment, error) {
	var doc whisperJSONFile
	if err := json.Unmarshal(data, &doc); err != nil {
		return nil, fmt.Errorf("parsing whisper JSON: %w", err)
	}

	entries := doc.Transcription
	if len(entries) == 0 {
		return nil, fmt.Errorf("whisper produced no transcription segments")
	}

	segs := make([]Segment, len(entries))
	for i := range entries {
		segs[i] = Segment{
			Start: float64(entries[i].Offsets.From) / 1000.0,
			End:   float64(entries[i].Offsets.To) / 1000.0,
			Text:  entries[i].Text,
		}
	}
	return segs, nil
}