Initial push to gitea
This commit is contained in:
42
internal/audio/audio.go
Normal file
42
internal/audio/audio.go
Normal file
@@ -0,0 +1,42 @@
|
||||
// Package audio normalizes arbitrary audio/video inputs into a whisper.cpp-friendly
|
||||
// 16 kHz mono PCM WAV file using ffmpeg.
|
||||
package audio
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// ExtractWAV converts input (any audio or video file ffmpeg can read) into a
// 16 kHz, mono, signed 16-bit little-endian PCM WAV file at outPath. The
// parent directory of outPath is created if needed and an existing output
// file is overwritten (-y).
//
// It returns an error when ffmpeg is not on PATH, when input cannot be
// stat'ed, when the output directory cannot be created, or when ffmpeg exits
// with a failure. ffmpeg's own diagnostics are forwarded to this process's
// stderr. The command is bound to ctx via exec.CommandContext.
func ExtractWAV(ctx context.Context, input, outPath string) error {
	if _, err := exec.LookPath("ffmpeg"); err != nil {
		return fmt.Errorf("ffmpeg not found on PATH: %w", err)
	}
	if _, err := os.Stat(input); err != nil {
		return fmt.Errorf("input not readable: %w", err)
	}
	outDir := filepath.Dir(outPath)
	if err := os.MkdirAll(outDir, 0o755); err != nil {
		// Wrap like every other failure here so the caller can tell which
		// step went wrong.
		return fmt.Errorf("creating output directory %q: %w", outDir, err)
	}

	cmd := exec.CommandContext(ctx, "ffmpeg",
		"-y",
		"-loglevel", "error",
		"-i", input,
		"-vn", // drop any video stream
		"-ac", "1", // mono
		"-ar", "16000", // 16 kHz sample rate
		"-c:a", "pcm_s16le",
		outPath,
	)
	// Both streams go to stderr so stdout stays free for the program's own
	// output.
	cmd.Stdout = os.Stderr
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("ffmpeg: %w", err)
	}
	return nil
}
|
||||
204
internal/clip/clip.go
Normal file
204
internal/clip/clip.go
Normal file
@@ -0,0 +1,204 @@
|
||||
// Package clip selects the best 60–90s window from a timestamped transcript
|
||||
// (using a Summarizer to do the picking) and runs ffmpeg to cut that window
|
||||
// out of the original media.
|
||||
package clip
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"publish/internal/summarize"
|
||||
"publish/internal/transcribe"
|
||||
)
|
||||
|
||||
// Selection holds the clip window chosen by the LLM, as decoded from the JSON
// object in its reply, along with the descriptive metadata returned with it.
type Selection struct {
	StartSeconds float64 `json:"start_seconds"` // window start, in seconds
	EndSeconds   float64 `json:"end_seconds"`   // window end, in seconds
	Title        string  `json:"title"`
	Hook         string  `json:"hook"`
	Quote        string  `json:"quote"`
	Reasoning    string  `json:"reasoning"`
}

// Duration reports the length of the selected window in seconds, computed as
// EndSeconds minus StartSeconds. The result is not checked for sign.
func (s Selection) Duration() float64 {
	return s.EndSeconds - s.StartSeconds
}
|
||||
|
||||
// Pick asks the summarizer to choose the best window in the given segments,
|
||||
// using promptTemplate (which may contain {{MIN_SECONDS}} / {{MAX_SECONDS}}
|
||||
// placeholders). It clamps and validates the returned window against minSec
|
||||
// and maxSec.
|
||||
func Pick(ctx context.Context, sum summarize.Summarizer, promptTemplate string, segs []transcribe.Segment, minSec, maxSec float64) (Selection, string, error) {
|
||||
if len(segs) == 0 {
|
||||
return Selection{}, "", fmt.Errorf("no transcript segments to choose from")
|
||||
}
|
||||
prompt := strings.NewReplacer(
|
||||
"{{MIN_SECONDS}}", fmt.Sprintf("%g", minSec),
|
||||
"{{MAX_SECONDS}}", fmt.Sprintf("%g", maxSec),
|
||||
).Replace(promptTemplate)
|
||||
|
||||
body := transcribe.FormatForLLM(segs)
|
||||
|
||||
raw, err := sum.Summarize(ctx, prompt, body)
|
||||
if err != nil {
|
||||
return Selection{}, "", err
|
||||
}
|
||||
|
||||
jsonText, err := extractJSONObject(raw)
|
||||
if err != nil {
|
||||
return Selection{}, raw, fmt.Errorf("could not find JSON object in model output: %w", err)
|
||||
}
|
||||
var sel Selection
|
||||
if err := json.Unmarshal([]byte(jsonText), &sel); err != nil {
|
||||
return Selection{}, raw, fmt.Errorf("parsing selection JSON: %w\n--- raw ---\n%s", err, jsonText)
|
||||
}
|
||||
|
||||
if err := validate(&sel, segs, minSec, maxSec); err != nil {
|
||||
return sel, raw, err
|
||||
}
|
||||
return sel, raw, nil
|
||||
}
|
||||
|
||||
func validate(sel *Selection, segs []transcribe.Segment, minSec, maxSec float64) error {
|
||||
if sel.EndSeconds <= sel.StartSeconds {
|
||||
return fmt.Errorf("invalid window: end (%g) <= start (%g)", sel.EndSeconds, sel.StartSeconds)
|
||||
}
|
||||
maxEnd := segs[len(segs)-1].End
|
||||
if sel.StartSeconds < 0 || sel.EndSeconds > maxEnd+1.0 {
|
||||
return fmt.Errorf("window [%g, %g] is outside transcript bounds [0, %g]",
|
||||
sel.StartSeconds, sel.EndSeconds, maxEnd)
|
||||
}
|
||||
dur := sel.Duration()
|
||||
// Allow small slop on either side; otherwise reject.
|
||||
if dur < minSec-2 || dur > maxSec+2 {
|
||||
return fmt.Errorf("window duration %.1fs is outside requested bounds [%g, %g]",
|
||||
dur, minSec, maxSec)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// extractJSONObject pulls a JSON object out of s, which may be wrapped in
// prose or a code fence when the model ignores instructions.
//
// Starting at the first '{', it scans balanced {...} spans one after another
// (ignoring braces inside double-quoted strings) and returns the first span
// that is valid JSON. If no span is valid JSON, it returns the first balanced
// span unchanged so the caller's decoder can report the precise syntax error.
//
// It returns an error when s contains no '{' or when the first '{' is never
// closed.
func extractJSONObject(s string) (string, error) {
	start := strings.Index(s, "{")
	if start < 0 {
		return "", fmt.Errorf("no '{' in response")
	}

	firstBalanced := ""
	for {
		end, ok := balancedObjectEnd(s, start)
		if !ok {
			// Everything after an unclosed '{' is inside it; stop looking.
			break
		}
		candidate := s[start:end]
		if json.Valid([]byte(candidate)) {
			return candidate, nil
		}
		if firstBalanced == "" {
			firstBalanced = candidate
		}
		// Braces in prose such as "{like this}" are skipped: continue with
		// the next span that starts after this one.
		next := strings.Index(s[end:], "{")
		if next < 0 {
			break
		}
		start = end + next
	}

	if firstBalanced != "" {
		return firstBalanced, nil
	}
	return "", fmt.Errorf("unbalanced braces")
}

// balancedObjectEnd scans s from the '{' at index start and returns the index
// just past its matching '}'. Braces inside double-quoted strings (with
// backslash escapes) are ignored. ok is false if the brace is never closed.
func balancedObjectEnd(s string, start int) (end int, ok bool) {
	depth := 0
	inStr, esc := false, false
	for i := start; i < len(s); i++ {
		c := s[i]
		if inStr {
			switch {
			case esc:
				esc = false
			case c == '\\':
				esc = true
			case c == '"':
				inStr = false
			}
			continue
		}
		switch c {
		case '"':
			inStr = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i + 1, true
			}
		}
	}
	return 0, false
}
|
||||
|
||||
// portraitFilter is the ffmpeg -vf filter chain used for re-encoded video
// clips. It center-crops any source aspect ratio to a 9:16 sub-rectangle (no
// distortion, just cropping) and scales the result to 1080x1920. The min()
// expressions pick the largest 9:16 box that fits inside the source: 16:9
// sources lose the left/right edges, 9:16 sources are unchanged, and 4:3 / 1:1
// sources crop the sides. setsar=1 forces square pixels. The commas inside
// min() are backslash-escaped because an unescaped comma separates filters in
// the chain.
const portraitFilter = `crop=min(iw\,ih*9/16):min(ih\,iw*16/9),scale=1080:1920,setsar=1`
|
||||
|
||||
// MaxClipBytes is the hard output-size ceiling, in bytes, that Extract passes
// to ffmpeg's -fs flag. Realistic 60–90s 1080x1920 H.264 clips at CRF 23 land
// around 30–100 MB, so this is a safety cap rather than a target.
const MaxClipBytes = 1 << 30 // 1 GiB
|
||||
|
||||
// Extract runs ffmpeg to cut [start, end) seconds out of input into outPath.
|
||||
// For video inputs, the clip is re-encoded as a 1080x1920 portrait (9:16
|
||||
// center-crop) under a 1 GiB size cap. If reencode is false, stream copy is
|
||||
// used (fast, keyframe-aligned, but the source aspect ratio is preserved).
|
||||
func Extract(ctx context.Context, input string, sel Selection, outPath string, reencode bool) error {
|
||||
if _, err := exec.LookPath("ffmpeg"); err != nil {
|
||||
return fmt.Errorf("ffmpeg not on PATH: %w", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(outPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dur := sel.EndSeconds - sel.StartSeconds
|
||||
args := []string{
|
||||
"-y",
|
||||
"-loglevel", "error",
|
||||
"-ss", fmt.Sprintf("%.3f", sel.StartSeconds),
|
||||
"-i", input,
|
||||
"-t", fmt.Sprintf("%.3f", dur),
|
||||
}
|
||||
if reencode {
|
||||
if hasVideoExt(input) {
|
||||
args = append(args,
|
||||
"-vf", portraitFilter,
|
||||
"-c:v", "libx264",
|
||||
"-preset", "fast",
|
||||
"-crf", "23",
|
||||
"-c:a", "aac",
|
||||
"-b:a", "128k",
|
||||
"-movflags", "+faststart",
|
||||
)
|
||||
} else {
|
||||
args = append(args,
|
||||
"-vn",
|
||||
"-c:a", "aac",
|
||||
"-b:a", "128k",
|
||||
)
|
||||
}
|
||||
} else {
|
||||
args = append(args, "-c", "copy")
|
||||
}
|
||||
args = append(args, "-fs", fmt.Sprintf("%d", MaxClipBytes), outPath)
|
||||
|
||||
cmd := exec.CommandContext(ctx, "ffmpeg", args...)
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("ffmpeg cut: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// hasVideoExt reports whether p ends in one of the video container
// extensions this package recognizes. The comparison is case-insensitive and
// looks only at the file name's extension, not at the file's contents.
func hasVideoExt(p string) bool {
	ext := strings.ToLower(filepath.Ext(p))
	for _, known := range [...]string{".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v", ".flv", ".ts"} {
		if ext == known {
			return true
		}
	}
	return false
}
|
||||
|
||||
// DefaultOutputPath builds <input-without-ext>.clip<ext> for video inputs and
|
||||
// .m4a for audio inputs.
|
||||
func DefaultOutputPath(input string) string {
|
||||
base := strings.TrimSuffix(input, filepath.Ext(input))
|
||||
if hasVideoExt(input) {
|
||||
return base + ".clip" + filepath.Ext(input)
|
||||
}
|
||||
return base + ".clip.m4a"
|
||||
}
|
||||
37
internal/clip/clip_test.go
Normal file
37
internal/clip/clip_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package clip
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestExtractJSONObject(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{"raw json", `{"a":1}`, `{"a":1}`},
|
||||
{"with prose", "Sure, here you go:\n{\"a\":1}\nThanks", `{"a":1}`},
|
||||
{"with fence", "```json\n{\"a\":1}\n```", `{"a":1}`},
|
||||
{"nested", `prelude {"a":{"b":2},"c":3} trailing`, `{"a":{"b":2},"c":3}`},
|
||||
{"brace in string", `{"text":"hello {world}"}`, `{"text":"hello {world}"}`},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got, err := extractJSONObject(c.in)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
if got != c.want {
|
||||
t.Errorf("got %q want %q", got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractJSONObjectMissing(t *testing.T) {
|
||||
if _, err := extractJSONObject("no json here"); err == nil {
|
||||
t.Error("expected error for missing JSON")
|
||||
}
|
||||
if _, err := extractJSONObject(`{"unterminated":`); err == nil {
|
||||
t.Error("expected error for unbalanced braces")
|
||||
}
|
||||
}
|
||||
30
internal/output/clipboard.go
Normal file
30
internal/output/clipboard.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package output
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CopyToClipboard pipes data to a platform clipboard tool, trying wl-copy
// (Wayland), xclip (X11), and pbcopy (macOS) in that order. Tools that are
// not on PATH are skipped. A tool that is installed but fails (for example
// wl-copy outside a Wayland session) does not abort the attempt: the next
// candidate is tried instead.
//
// It returns the name of the tool that succeeded. If every installed tool
// fails, the error from the first failure is returned. If none is installed,
// the error says which tools were tried.
func CopyToClipboard(data string) (string, error) {
	candidates := [][]string{
		{"wl-copy"},
		{"xclip", "-selection", "clipboard"},
		{"pbcopy"},
	}
	var firstErr error
	for _, c := range candidates {
		if _, err := exec.LookPath(c[0]); err != nil {
			continue
		}
		cmd := exec.Command(c[0], c[1:]...)
		cmd.Stdin = strings.NewReader(data)
		if err := cmd.Run(); err != nil {
			if firstErr == nil {
				firstErr = fmt.Errorf("%s: %w", c[0], err)
			}
			continue
		}
		return c[0], nil
	}
	if firstErr != nil {
		return "", firstErr
	}
	return "", fmt.Errorf("no clipboard tool found (tried wl-copy, xclip, pbcopy)")
}
|
||||
154
internal/output/spotify.go
Normal file
154
internal/output/spotify.go
Normal file
@@ -0,0 +1,154 @@
|
||||
// Package output renders summaries to user-visible formats. Markdown is
|
||||
// passed through; Spotify HTML uses the small tag subset that Spotify for
|
||||
// Podcasters' show-notes editor accepts (b, i, a, ul/ol/li, paragraphs).
|
||||
package output
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Inline markdown patterns used by inline. None of them match across a
// newline.
var (
	// **bold**
	reBoldStar = regexp.MustCompile(`\*\*([^*\n]+)\*\*`)
	// __bold__
	reBoldUnder = regexp.MustCompile(`__([^_\n]+)__`)
	// *italic*
	reItalicStar = regexp.MustCompile(`\*([^*\n]+)\*`)
	// _italic_, only when preceded by start-of-text, whitespace or "(" and
	// followed by end-of-text, whitespace or closing punctuation, so
	// underscores inside a word are left alone. Groups 1 and 3 capture that
	// surrounding context so it can be written back.
	reItalicUnder = regexp.MustCompile(`(^|[\s(])_([^_\n]+)_($|[\s).,!?;:])`)
	// [text](url); the URL may not contain whitespace or ")".
	reLink = regexp.MustCompile(`\[([^\]]+)\]\(([^)\s]+)\)`)
	// `code`
	reInlineCode = regexp.MustCompile("`([^`\n]+)`")
)
|
||||
|
||||
// MarkdownToSpotifyHTML converts a markdown summary into the limited HTML
|
||||
// subset Spotify for Podcasters renders. Unknown markdown structures degrade
|
||||
// to plain text rather than producing rejected tags.
|
||||
func MarkdownToSpotifyHTML(md string) string {
|
||||
lines := strings.Split(strings.ReplaceAll(md, "\r\n", "\n"), "\n")
|
||||
|
||||
var out strings.Builder
|
||||
listKind := "" // "ul" or "ol" while we're inside a list
|
||||
flushList := func() {
|
||||
if listKind != "" {
|
||||
out.WriteString("</" + listKind + ">\n")
|
||||
listKind = ""
|
||||
}
|
||||
}
|
||||
openList := func(kind string) {
|
||||
if listKind != kind {
|
||||
flushList()
|
||||
out.WriteString("<" + kind + ">\n")
|
||||
listKind = kind
|
||||
}
|
||||
}
|
||||
|
||||
paragraph := []string{}
|
||||
flushPara := func() {
|
||||
if len(paragraph) == 0 {
|
||||
return
|
||||
}
|
||||
text := strings.Join(paragraph, " ")
|
||||
out.WriteString("<p>" + inline(text) + "</p>\n")
|
||||
paragraph = paragraph[:0]
|
||||
}
|
||||
|
||||
for _, raw := range lines {
|
||||
line := strings.TrimRight(raw, " \t")
|
||||
trim := strings.TrimSpace(line)
|
||||
|
||||
// Blank line: end current paragraph/list block.
|
||||
if trim == "" {
|
||||
flushPara()
|
||||
flushList()
|
||||
continue
|
||||
}
|
||||
|
||||
// Horizontal rule.
|
||||
if trim == "---" || trim == "***" || trim == "___" {
|
||||
flushPara()
|
||||
flushList()
|
||||
continue
|
||||
}
|
||||
|
||||
// Heading -> bold paragraph.
|
||||
if h := headingText(trim); h != "" {
|
||||
flushPara()
|
||||
flushList()
|
||||
out.WriteString("<p><b>" + inline(h) + "</b></p>\n")
|
||||
continue
|
||||
}
|
||||
|
||||
// Blockquote -> italic paragraph.
|
||||
if strings.HasPrefix(trim, "> ") {
|
||||
flushPara()
|
||||
flushList()
|
||||
out.WriteString("<p><i>" + inline(strings.TrimPrefix(trim, "> ")) + "</i></p>\n")
|
||||
continue
|
||||
}
|
||||
|
||||
// Unordered list item.
|
||||
if strings.HasPrefix(trim, "- ") || strings.HasPrefix(trim, "* ") || strings.HasPrefix(trim, "+ ") {
|
||||
flushPara()
|
||||
openList("ul")
|
||||
out.WriteString(" <li>" + inline(trim[2:]) + "</li>\n")
|
||||
continue
|
||||
}
|
||||
|
||||
// Ordered list item like "1. text".
|
||||
if item, ok := orderedItem(trim); ok {
|
||||
flushPara()
|
||||
openList("ol")
|
||||
out.WriteString(" <li>" + inline(item) + "</li>\n")
|
||||
continue
|
||||
}
|
||||
|
||||
// Anything else: append to current paragraph.
|
||||
flushList()
|
||||
paragraph = append(paragraph, trim)
|
||||
}
|
||||
|
||||
flushPara()
|
||||
flushList()
|
||||
|
||||
return strings.TrimRight(out.String(), "\n")
|
||||
}
|
||||
|
||||
// headingText returns the text of an ATX-style markdown heading: one to six
// leading '#' characters followed by a space. The text is trimmed of
// surrounding whitespace. It returns "" when s is not such a heading, and
// also when the heading has no text.
func headingText(s string) string {
	rest := strings.TrimLeft(s, "#")
	level := len(s) - len(rest)
	if level < 1 || level > 6 {
		return ""
	}
	// A space must follow the hashes; this also rejects a line of hashes only.
	if !strings.HasPrefix(rest, " ") {
		return ""
	}
	return strings.TrimSpace(rest[1:])
}
|
||||
|
||||
// orderedItem recognizes an ordered-list line of the form "<digits>. text".
// On a match it returns the item text, trimmed of surrounding whitespace, and
// true. Otherwise it returns "" and false.
func orderedItem(s string) (string, bool) {
	rest := strings.TrimLeft(s, "0123456789")
	if len(rest) == len(s) {
		// No leading digits.
		return "", false
	}
	if !strings.HasPrefix(rest, ". ") {
		return "", false
	}
	return strings.TrimSpace(rest[2:]), true
}
|
||||
|
||||
func inline(s string) string {
|
||||
s = escapeHTML(s)
|
||||
s = reInlineCode.ReplaceAllString(s, "$1")
|
||||
s = reBoldStar.ReplaceAllString(s, "<b>$1</b>")
|
||||
s = reBoldUnder.ReplaceAllString(s, "<b>$1</b>")
|
||||
s = reItalicStar.ReplaceAllString(s, "<i>$1</i>")
|
||||
s = reItalicUnder.ReplaceAllString(s, "$1<i>$2</i>$3")
|
||||
s = reLink.ReplaceAllString(s, `<a href="$2">$1</a>`)
|
||||
return s
|
||||
}
|
||||
|
||||
// escapeHTML replaces the HTML-significant characters &, < and > in s with
// their entities (&amp;, &lt;, &gt;) so that text from the summary cannot be
// interpreted as markup. The replacement is a single pass, so an ampersand
// introduced by one substitution is never escaped a second time. Quotes are
// left untouched.
func escapeHTML(s string) string {
	r := strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
	)
	return r.Replace(s)
}
|
||||
67
internal/output/spotify_test.go
Normal file
67
internal/output/spotify_test.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package output
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMarkdownToSpotifyHTML(t *testing.T) {
|
||||
in := `# Sermon Title
|
||||
|
||||
**Speaker:** Pastor Bob
|
||||
**Scripture:** John 3:16
|
||||
|
||||
## Overview
|
||||
This was a *short* message about hope. See [the site](https://example.com).
|
||||
|
||||
## Key Points
|
||||
- First point
|
||||
- Second point with **bold** text
|
||||
- Third one
|
||||
|
||||
1. Step one
|
||||
2. Step two
|
||||
|
||||
> A pithy quote.
|
||||
`
|
||||
|
||||
got := MarkdownToSpotifyHTML(in)
|
||||
|
||||
mustContain := []string{
|
||||
"<p><b>Sermon Title</b></p>",
|
||||
"<b>Speaker:</b>",
|
||||
"<p><b>Overview</b></p>",
|
||||
"<i>short</i>",
|
||||
`<a href="https://example.com">the site</a>`,
|
||||
"<ul>",
|
||||
"<li>First point</li>",
|
||||
"<li>Second point with <b>bold</b> text</li>",
|
||||
"</ul>",
|
||||
"<ol>",
|
||||
"<li>Step one</li>",
|
||||
"</ol>",
|
||||
"<p><i>A pithy quote.</i></p>",
|
||||
}
|
||||
for _, s := range mustContain {
|
||||
if !strings.Contains(got, s) {
|
||||
t.Errorf("expected output to contain %q\n--- got ---\n%s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
mustNotContain := []string{"<h1>", "<h2>", "<blockquote>", "**", "##"}
|
||||
for _, s := range mustNotContain {
|
||||
if strings.Contains(got, s) {
|
||||
t.Errorf("did not expect output to contain %q\n--- got ---\n%s", s, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEscapesHTML(t *testing.T) {
|
||||
got := MarkdownToSpotifyHTML("A <script>tag</script> & ampersand")
|
||||
if strings.Contains(got, "<script>") {
|
||||
t.Errorf("unescaped <script>: %s", got)
|
||||
}
|
||||
if !strings.Contains(got, "&") {
|
||||
t.Errorf("expected & in: %s", got)
|
||||
}
|
||||
}
|
||||
123
internal/summarize/anthropic.go
Normal file
123
internal/summarize/anthropic.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package summarize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Anthropic is a Summarizer that calls the Claude Messages API directly over
// net/http, avoiding an SDK dependency. Every field is optional and falls back
// to a default inside Summarize.
type Anthropic struct {
	// APIKey authenticates requests; when empty, the ANTHROPIC_API_KEY
	// environment variable is used instead.
	APIKey string
	// Model is the model identifier sent in the request.
	Model string
	// MaxTokens is the max_tokens value sent in the request.
	MaxTokens int
	// BaseURL is an optional override; defaults to https://api.anthropic.com.
	BaseURL string
	// Client is the HTTP client used for requests; when nil, a client with a
	// five-minute timeout is created per call.
	Client *http.Client
}

// Name identifies this summarizer backend.
func (a *Anthropic) Name() string {
	return "anthropic-api"
}
|
||||
|
||||
// anthroMessage is one entry of the "messages" array in a Messages API
// request: a role and its text content.
type anthroMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}
|
||||
|
||||
// anthroRequest is the JSON body POSTed to /v1/messages. System is omitted
// from the encoded body when empty.
type anthroRequest struct {
	Model     string          `json:"model"`
	MaxTokens int             `json:"max_tokens"`
	System    string          `json:"system,omitempty"`
	Messages  []anthroMessage `json:"messages"`
}
|
||||
|
||||
// anthroContentBlock is one element of a response's "content" array. Only
// blocks whose Type is "text" are used by Summarize.
type anthroContentBlock struct {
	Type string `json:"type"`
	Text string `json:"text"`
}
|
||||
|
||||
// anthroResponse is the subset of the Messages API response body that
// Summarize decodes. Error is non-nil when the body carries an "error" object.
type anthroResponse struct {
	Content []anthroContentBlock `json:"content"`
	Error   *struct {
		Type    string `json:"type"`
		Message string `json:"message"`
	} `json:"error,omitempty"`
}
|
||||
|
||||
func (a *Anthropic) Summarize(ctx context.Context, systemPrompt, userContent string) (string, error) {
|
||||
key := a.APIKey
|
||||
if key == "" {
|
||||
key = os.Getenv("ANTHROPIC_API_KEY")
|
||||
}
|
||||
if key == "" {
|
||||
return "", fmt.Errorf("ANTHROPIC_API_KEY is not set")
|
||||
}
|
||||
model := a.Model
|
||||
if model == "" {
|
||||
model = "claude-sonnet-4-6"
|
||||
}
|
||||
maxTokens := a.MaxTokens
|
||||
if maxTokens == 0 {
|
||||
maxTokens = 4096
|
||||
}
|
||||
baseURL := a.BaseURL
|
||||
if baseURL == "" {
|
||||
baseURL = "https://api.anthropic.com"
|
||||
}
|
||||
client := a.Client
|
||||
if client == nil {
|
||||
client = &http.Client{Timeout: 5 * time.Minute}
|
||||
}
|
||||
|
||||
body := anthroRequest{
|
||||
Model: model,
|
||||
MaxTokens: maxTokens,
|
||||
System: systemPrompt,
|
||||
Messages: []anthroMessage{
|
||||
{Role: "user", Content: userContent},
|
||||
},
|
||||
}
|
||||
buf, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", baseURL+"/v1/messages", bytes.NewReader(buf))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("x-api-key", key)
|
||||
req.Header.Set("anthropic-version", "2023-06-01")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
|
||||
if resp.StatusCode/100 != 2 {
|
||||
return "", fmt.Errorf("anthropic API %d: %s", resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
var out anthroResponse
|
||||
if err := json.Unmarshal(respBody, &out); err != nil {
|
||||
return "", fmt.Errorf("decoding anthropic response: %w", err)
|
||||
}
|
||||
if out.Error != nil {
|
||||
return "", fmt.Errorf("anthropic error: %s: %s", out.Error.Type, out.Error.Message)
|
||||
}
|
||||
|
||||
var text bytes.Buffer
|
||||
for _, c := range out.Content {
|
||||
if c.Type == "text" {
|
||||
text.WriteString(c.Text)
|
||||
}
|
||||
}
|
||||
return text.String(), nil
|
||||
}
|
||||
49
internal/summarize/claudecli.go
Normal file
49
internal/summarize/claudecli.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package summarize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ClaudeCLI is a Summarizer that shells out to the `claude` CLI in print mode
// (-p). The user content is written to the process's stdin rather than passed
// as an argument, so very long transcripts do not run into ARG_MAX.
type ClaudeCLI struct {
	// Bin is the binary name; defaults to "claude".
	Bin string
	// Model is passed through as `claude --model`. Empty leaves the CLI
	// default in place.
	Model string
	// ExtraArgs are appended verbatim, before the prompt argument.
	ExtraArgs []string
}

// Name identifies this summarizer backend.
func (c *ClaudeCLI) Name() string {
	return "claude-cli"
}
|
||||
|
||||
func (c *ClaudeCLI) Summarize(ctx context.Context, systemPrompt, userContent string) (string, error) {
|
||||
bin := c.Bin
|
||||
if bin == "" {
|
||||
bin = "claude"
|
||||
}
|
||||
if _, err := exec.LookPath(bin); err != nil {
|
||||
return "", fmt.Errorf("%q not on PATH: %w", bin, err)
|
||||
}
|
||||
|
||||
args := []string{"-p"}
|
||||
if c.Model != "" {
|
||||
args = append(args, "--model", c.Model)
|
||||
}
|
||||
args = append(args, c.ExtraArgs...)
|
||||
args = append(args, systemPrompt)
|
||||
|
||||
cmd := exec.CommandContext(ctx, bin, args...)
|
||||
cmd.Stdin = strings.NewReader(userContent)
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
return "", fmt.Errorf("%s: %w (stderr: %s)", bin, err, strings.TrimSpace(stderr.String()))
|
||||
}
|
||||
return strings.TrimSpace(stdout.String()), nil
|
||||
}
|
||||
13
internal/summarize/summarize.go
Normal file
13
internal/summarize/summarize.go
Normal file
@@ -0,0 +1,13 @@
|
||||
// Package summarize turns a transcript + system prompt into a markdown summary.
|
||||
package summarize
|
||||
|
||||
import "context"
|
||||
|
||||
// Summarizer produces a markdown summary (or other generation) guided by
// systemPrompt and given the user-message body. The body is passed verbatim:
// callers are responsible for any framing like "Transcript:", "Producer's
// notes:", or timestamped segment formatting.
type Summarizer interface {
	// Summarize returns the model's text output for the given system prompt
	// and user content.
	Summarize(ctx context.Context, systemPrompt, userContent string) (string, error)
	// Name returns a short identifier for the backend.
	Name() string
}
|
||||
49
internal/transcribe/segments.go
Normal file
49
internal/transcribe/segments.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package transcribe
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Segment is one timestamped chunk of a transcript.
type Segment struct {
	Start float64 // seconds from start of audio
	End   float64 // seconds from start of audio
	Text  string  // segment text as produced by the transcriber; helpers trim surrounding whitespace
}
|
||||
|
||||
// PlainText joins all segments into a single transcript.
|
||||
func PlainText(segs []Segment) string {
|
||||
var b strings.Builder
|
||||
for _, s := range segs {
|
||||
b.WriteString(strings.TrimSpace(s.Text))
|
||||
b.WriteByte(' ')
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// FormatForLLM renders segments as one timestamped line each, suitable for
|
||||
// feeding to a model that needs to pick a time window.
|
||||
//
|
||||
// [mm:ss] [mm:ss] text
|
||||
func FormatForLLM(segs []Segment) string {
|
||||
var b strings.Builder
|
||||
for _, s := range segs {
|
||||
fmt.Fprintf(&b, "[%s] [%s] %s\n", formatTS(s.Start), formatTS(s.End), strings.TrimSpace(s.Text))
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// formatTS renders a time offset in seconds as "mm:ss", or as "hh:mm:ss" once
// it reaches a full hour. Fractional seconds are truncated and negative
// offsets are treated as zero.
func formatTS(seconds float64) string {
	if seconds < 0 {
		seconds = 0
	}
	whole := int(seconds)
	hours, rem := whole/3600, whole%3600
	minutes, secs := rem/60, rem%60
	if hours > 0 {
		return fmt.Sprintf("%02d:%02d:%02d", hours, minutes, secs)
	}
	return fmt.Sprintf("%02d:%02d", minutes, secs)
}
|
||||
10
internal/transcribe/transcribe.go
Normal file
10
internal/transcribe/transcribe.go
Normal file
@@ -0,0 +1,10 @@
|
||||
// Package transcribe converts a normalized WAV file into a plain-text transcript.
|
||||
package transcribe
|
||||
|
||||
import "context"
|
||||
|
||||
// Transcriber turns a 16kHz mono WAV at wavPath into a plaintext transcript.
type Transcriber interface {
	// Transcribe returns the transcript text for the WAV file at wavPath.
	Transcribe(ctx context.Context, wavPath string) (string, error)
	// Name returns a short identifier for the backend.
	Name() string
}
|
||||
213
internal/transcribe/whispercpp.go
Normal file
213
internal/transcribe/whispercpp.go
Normal file
@@ -0,0 +1,213 @@
|
||||
package transcribe
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// WhisperCPP is a Transcriber that shells out to a whisper.cpp CLI binary
// (whisper-cli, whisper-cpp, legacy `main`, or a GPU-specific build) and reads
// the JSON file the binary writes with -oj next to the input WAV.
type WhisperCPP struct {
	// Bin is the whisper.cpp binary name or path. When empty, a binary is
	// auto-detected (GPU builds first, then CPU builds).
	Bin string
	// Model is the path to a ggml whisper model (.bin). Required.
	Model string
	// Language to force; empty means auto-detect.
	Language string
	// Threads to use; 0 lets whisper.cpp pick.
	Threads int
	// ExtraArgs are appended to the command verbatim.
	ExtraArgs []string
	// Verbose enables per-step diagnostic logging to stderr during binary
	// auto-detection (which binary was missing, which probe failed). The
	// backend finally chosen by auto-detection is logged on one stderr line
	// whether or not this flag is set.
	Verbose bool
}

// Name identifies this transcriber backend.
func (w *WhisperCPP) Name() string {
	return "whisper.cpp"
}
|
||||
|
||||
func (w *WhisperCPP) Transcribe(ctx context.Context, wavPath string) (string, error) {
|
||||
segs, err := w.TranscribeSegments(ctx, wavPath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return PlainText(segs), nil
|
||||
}
|
||||
|
||||
// TranscribeSegments runs whisper.cpp with JSON output and returns the
|
||||
// per-segment timestamps (in seconds) and text.
|
||||
func (w *WhisperCPP) TranscribeSegments(ctx context.Context, wavPath string) ([]Segment, error) {
|
||||
bin, err := w.resolveBin()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if w.Model == "" {
|
||||
return nil, fmt.Errorf("whisper.cpp model path is required (--whisper-model)")
|
||||
}
|
||||
if _, err := os.Stat(w.Model); err != nil {
|
||||
return nil, fmt.Errorf("whisper model not readable at %s: %w", w.Model, err)
|
||||
}
|
||||
|
||||
dir := filepath.Dir(wavPath)
|
||||
base := strings.TrimSuffix(filepath.Base(wavPath), filepath.Ext(wavPath))
|
||||
outBase := filepath.Join(dir, base)
|
||||
jsonPath := outBase + ".json"
|
||||
_ = os.Remove(jsonPath)
|
||||
|
||||
args := []string{
|
||||
"-m", w.Model,
|
||||
"-f", wavPath,
|
||||
"-oj",
|
||||
"-of", outBase,
|
||||
"--no-prints",
|
||||
}
|
||||
if w.Language != "" {
|
||||
args = append(args, "-l", w.Language)
|
||||
}
|
||||
if w.Threads > 0 {
|
||||
args = append(args, "-t", fmt.Sprintf("%d", w.Threads))
|
||||
}
|
||||
args = append(args, w.ExtraArgs...)
|
||||
|
||||
cmd := exec.CommandContext(ctx, bin, args...)
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
return nil, fmt.Errorf("%s: %w", bin, err)
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(jsonPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading whisper json %s: %w", jsonPath, err)
|
||||
}
|
||||
return parseWhisperJSON(data)
|
||||
}
|
||||
|
||||
// gpuBackend describes one accelerated whisper.cpp build we may pick at
// runtime. The binary is conventionally installed at ~/.local/bin/<bin> (or
// anywhere on PATH); the probe is a fast command that exits 0 only when the
// matching GPU runtime is actually usable on this machine.
//
// Field order matters: gpuBackends initializes these values positionally.
type gpuBackend struct {
	name  string   // human-readable backend name used in log lines
	bin   string   // binary file name to look for
	probe []string // probe command: argv[0] followed by its arguments
}
|
||||
|
||||
// gpuBackends lists the GPU builds resolveBin tries, in order of preference.
// The first entry whose binary is installed and whose probe succeeds is used.
var gpuBackends = []gpuBackend{
	{"CUDA", "whisper-cli-cuda", []string{"nvidia-smi", "-L"}},
	{"ROCm", "whisper-cli-rocm", []string{"rocminfo"}},
	{"Vulkan", "whisper-cli-vulkan", []string{"vulkaninfo", "--summary"}},
}
|
||||
|
||||
func (w *WhisperCPP) resolveBin() (string, error) {
|
||||
if w.Bin != "" {
|
||||
if _, err := exec.LookPath(w.Bin); err == nil {
|
||||
return w.Bin, nil
|
||||
}
|
||||
if _, err := os.Stat(w.Bin); err == nil {
|
||||
return w.Bin, nil
|
||||
}
|
||||
return "", fmt.Errorf("whisper.cpp binary %q not found on PATH", w.Bin)
|
||||
}
|
||||
|
||||
// Metal is always usable on macOS — no separate probe needed; if the
|
||||
// binary exists we trust it.
|
||||
if runtime.GOOS == "darwin" {
|
||||
if path := findBinary("whisper-cli-metal"); path != "" {
|
||||
fmt.Fprintf(os.Stderr, "whisper: using Metal backend (%s)\n", path)
|
||||
return path, nil
|
||||
}
|
||||
}
|
||||
|
||||
for _, b := range gpuBackends {
|
||||
path := findBinary(b.bin)
|
||||
if path == "" {
|
||||
if w.Verbose {
|
||||
fmt.Fprintf(os.Stderr, "whisper: no %s binary (%s) installed; skipping\n", b.name, b.bin)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !probeSucceeds(b.probe) {
|
||||
if w.Verbose {
|
||||
fmt.Fprintf(os.Stderr, "whisper: %s binary present at %s but %s probe failed; trying next\n", b.name, path, b.probe[0])
|
||||
}
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "whisper: using %s backend (%s)\n", b.name, path)
|
||||
return path, nil
|
||||
}
|
||||
|
||||
for _, alt := range []string{"whisper-cli", "whisper-cpp", "main"} {
|
||||
if path, e := exec.LookPath(alt); e == nil {
|
||||
fmt.Fprintf(os.Stderr, "whisper: using CPU backend (%s)\n", path)
|
||||
return path, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no whisper.cpp binary found (tried GPU builds whisper-cli-{cuda,rocm,vulkan} in ~/.local/bin and PATH, then CPU whisper-cli/whisper-cpp/main on PATH); pass --whisper-bin")
|
||||
}
|
||||
|
||||
// findBinary locates name, preferring ~/.local/bin (the convention for
// hand-built backends) over PATH. In ~/.local/bin any non-directory entry
// with that name is accepted. It returns the path found, or "" if name is in
// neither place.
func findBinary(name string) string {
	home, homeErr := os.UserHomeDir()
	if homeErr == nil {
		local := filepath.Join(home, ".local", "bin", name)
		if st, err := os.Stat(local); err == nil && !st.IsDir() {
			return local
		}
	}

	onPath, err := exec.LookPath(name)
	if err != nil {
		return ""
	}
	return onPath
}
|
||||
|
||||
// probeSucceeds reports whether the probe command argv (program followed by
// its arguments) is on PATH and exits 0 within five seconds. It is used to
// confirm that a GPU runtime is actually usable before committing to the
// matching whisper-cli build. argv must be non-empty.
func probeSucceeds(argv []string) bool {
	program, rest := argv[0], argv[1:]
	if _, err := exec.LookPath(program); err != nil {
		return false
	}

	// A hung probe must not stall backend selection.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	err := exec.CommandContext(ctx, program, rest...).Run()
	return err == nil
}
|
||||
|
||||
// whisperJSONFile mirrors the structure whisper.cpp writes with -oj. Only the
// fields parseWhisperJSON needs are declared.
type whisperJSONFile struct {
	Transcription []struct {
		// Offsets are in milliseconds; parseWhisperJSON divides them by 1000
		// to obtain seconds.
		Offsets struct {
			From int64 `json:"from"`
			To   int64 `json:"to"`
		} `json:"offsets"`
		Text string `json:"text"`
	} `json:"transcription"`
}
|
||||
|
||||
func parseWhisperJSON(data []byte) ([]Segment, error) {
|
||||
var f whisperJSONFile
|
||||
if err := json.Unmarshal(data, &f); err != nil {
|
||||
return nil, fmt.Errorf("parsing whisper JSON: %w", err)
|
||||
}
|
||||
if len(f.Transcription) == 0 {
|
||||
return nil, fmt.Errorf("whisper produced no transcription segments")
|
||||
}
|
||||
out := make([]Segment, 0, len(f.Transcription))
|
||||
for _, s := range f.Transcription {
|
||||
out = append(out, Segment{
|
||||
Start: float64(s.Offsets.From) / 1000.0,
|
||||
End: float64(s.Offsets.To) / 1000.0,
|
||||
Text: s.Text,
|
||||
})
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
Reference in New Issue
Block a user