Files
Summerize/internal/clip/clip.go
2026-05-10 13:37:17 -06:00

205 lines
6.0 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package clip selects the best 60-90s window from a timestamped transcript
// (using a Summarizer to do the picking) and runs ffmpeg to cut that window
// out of the original media.
package clip
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"publish/internal/summarize"
"publish/internal/transcribe"
)
// Selection describes the clip window chosen by the LLM, together with the
// descriptive metadata it returned alongside the timestamps. It is decoded
// directly from the model's JSON answer using the field tags below.
type Selection struct {
	// StartSeconds is the offset, in seconds, at which the clip begins.
	StartSeconds float64 `json:"start_seconds"`
	// EndSeconds is the offset, in seconds, at which the clip ends.
	EndSeconds float64 `json:"end_seconds"`

	// Title, Hook, Quote and Reasoning are free-form text supplied by the
	// model; the code in this file carries them along without inspecting them.
	Title     string `json:"title"`
	Hook      string `json:"hook"`
	Quote     string `json:"quote"`
	Reasoning string `json:"reasoning"`
}

// Duration reports the length of the selected window in seconds, i.e.
// EndSeconds minus StartSeconds. The result is not checked for sign.
func (s Selection) Duration() float64 {
	length := s.EndSeconds - s.StartSeconds
	return length
}
// Pick asks sum to choose the best clip window from segs.
//
// promptTemplate may contain the placeholders {{MIN_SECONDS}} and
// {{MAX_SECONDS}}; they are replaced with minSec and maxSec (formatted with
// %g) before the prompt is sent together with the LLM-formatted transcript.
//
// The first JSON object found in the model's reply is decoded into a
// Selection and then validated (not adjusted) against the transcript bounds
// and the [minSec, maxSec] duration range.
//
// Returns the selection, the raw model output, and an error:
//   - empty segs or a summarizer failure: zero Selection, empty raw output;
//   - no JSON object found, or undecodable JSON: zero Selection plus the raw
//     output for diagnostics;
//   - a decoded window that fails validation: the decoded Selection, the raw
//     output, and the validation error.
func Pick(ctx context.Context, sum summarize.Summarizer, promptTemplate string, segs []transcribe.Segment, minSec, maxSec float64) (Selection, string, error) {
	var none Selection
	if len(segs) == 0 {
		return none, "", fmt.Errorf("no transcript segments to choose from")
	}

	// Fill in the duration placeholders, then hand prompt + transcript to the model.
	placeholders := strings.NewReplacer(
		"{{MIN_SECONDS}}", fmt.Sprintf("%g", minSec),
		"{{MAX_SECONDS}}", fmt.Sprintf("%g", maxSec),
	)
	instructions := placeholders.Replace(promptTemplate)
	transcript := transcribe.FormatForLLM(segs)

	raw, err := sum.Summarize(ctx, instructions, transcript)
	if err != nil {
		return none, "", err
	}

	// The model may wrap its answer in prose; isolate the JSON object first.
	object, err := extractJSONObject(raw)
	if err != nil {
		return none, raw, fmt.Errorf("could not find JSON object in model output: %w", err)
	}

	var picked Selection
	if err = json.Unmarshal([]byte(object), &picked); err != nil {
		return none, raw, fmt.Errorf("parsing selection JSON: %w\n--- raw ---\n%s", err, object)
	}

	// A failed validation still returns the decoded selection so the caller
	// can see what the model proposed.
	err = validate(&picked, segs, minSec, maxSec)
	return picked, raw, err
}
// validate checks that sel is a usable window for the given transcript.
//
// It rejects, in order: a window whose end is not after its start; a window
// that starts before 0 or ends more than one second past the End of the last
// segment; and a window whose duration falls outside [minSec, maxSec] by more
// than two seconds. sel is only read, never modified.
//
// segs must be non-empty: the last element is indexed unconditionally.
func validate(sel *Selection, segs []transcribe.Segment, minSec, maxSec float64) error {
	const (
		endSlop      = 1.0 // seconds the window may run past the last segment
		durationSlop = 2   // seconds of tolerance on either duration bound
	)

	start, end := sel.StartSeconds, sel.EndSeconds
	if end <= start {
		return fmt.Errorf("invalid window: end (%g) <= start (%g)", end, start)
	}

	// The End of the final segment is taken as the end of the transcript.
	transcriptEnd := segs[len(segs)-1].End
	if start < 0 || end > transcriptEnd+endSlop {
		return fmt.Errorf("window [%g, %g] is outside transcript bounds [0, %g]",
			start, end, transcriptEnd)
	}

	length := sel.Duration()
	tooShort := length < minSec-durationSlop
	tooLong := length > maxSec+durationSlop
	if tooShort || tooLong {
		return fmt.Errorf("window duration %.1fs is outside requested bounds [%g, %g]",
			length, minSec, maxSec)
	}
	return nil
}
// extractJSONObject pulls a JSON object out of s, ignoring braces that appear
// inside JSON strings. Useful when the model wraps its answer in prose despite
// being told not to.
//
// Balanced {...} spans are examined left to right and the first one that is
// syntactically valid JSON is returned. This lets a stray brace pair in the
// surrounding prose (e.g. "fill in {placeholders}") be skipped instead of
// being mistaken for the answer. Spans are never searched from the inside, so
// a fragment nested in a malformed or truncated object is never returned.
//
// If no span is valid JSON, the first balanced span is returned unchanged so
// the caller's decoder can report the precise syntax error. An error is
// returned when s contains no '{' at all, or when the first '{' is never
// closed.
func extractJSONObject(s string) (string, error) {
	first := strings.IndexByte(s, '{')
	if first < 0 {
		return "", fmt.Errorf("no '{' in response")
	}

	// firstBalanced remembers the span starting at the first '{'; it is the
	// fallback result when nothing in s validates as JSON.
	firstBalanced := ""
	for start := first; ; {
		end := balancedObjectEnd(s, start)
		if end < 0 {
			// Everything from here on sits inside an unclosed brace.
			break
		}
		candidate := s[start:end]
		if json.Valid([]byte(candidate)) {
			return candidate, nil
		}
		if firstBalanced == "" {
			firstBalanced = candidate
		}
		// Resume after the rejected span; spans never overlap, so the whole
		// scan stays linear in len(s).
		next := strings.IndexByte(s[end:], '{')
		if next < 0 {
			break
		}
		start = end + next
	}

	if firstBalanced != "" {
		return firstBalanced, nil
	}
	return "", fmt.Errorf("unbalanced braces")
}

// balancedObjectEnd returns the index just past the '}' that closes the '{'
// at s[start], or -1 if that brace is never closed. Braces inside
// double-quoted strings (with backslash escapes honoured) are not counted.
func balancedObjectEnd(s string, start int) int {
	depth := 0
	inStr, esc := false, false
	for i := start; i < len(s); i++ {
		c := s[i]
		if inStr {
			switch {
			case esc:
				esc = false
			case c == '\\':
				esc = true
			case c == '"':
				inStr = false
			}
			continue
		}
		switch c {
		case '"':
			inStr = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i + 1
			}
		}
	}
	return -1
}
// portraitFilter is the ffmpeg video filter chain used for portrait
// re-encodes. It center-crops any source aspect ratio to a 9:16 sub-rectangle
// (no distortion, just cropping) and scales the result to 1080x1920. The
// min() expressions pick the largest 9:16 box that fits inside the source:
// sources wider than 9:16 (16:9, 4:3, 1:1) lose their left/right edges,
// 9:16 sources are not cropped, and narrower sources lose top/bottom.
// setsar=1 forces square pixels. The commas inside min() are backslash-escaped
// so ffmpeg does not read them as filter separators.
const portraitFilter = `crop=min(iw\,ih*9/16):min(ih\,iw*16/9),scale=1080:1920,setsar=1`
// MaxClipBytes is the hard size ceiling enforced by ffmpeg's -fs flag.
// Realistic 60-90s 1080x1920 H.264 clips at CRF 23 land at 30-100 MB, so this
// is a safety cap rather than a target.
const MaxClipBytes = 1 << 30 // 1 GiB
// Extract runs ffmpeg to cut [sel.StartSeconds, sel.EndSeconds) out of input
// into outPath, creating outPath's parent directory if needed and overwriting
// any existing file (-y).
//
// Encoding depends on reencode and on input's file extension:
//   - reencode with a video extension: 1080x1920 portrait (9:16 center-crop)
//     H.264 at CRF 23 with 128k AAC audio;
//   - reencode with any other extension: audio only, 128k AAC;
//   - reencode false: stream copy (fast, keyframe-aligned, and the source
//     aspect ratio is preserved).
//
// Every mode is capped at MaxClipBytes through ffmpeg's -fs flag. ffmpeg's
// own output is forwarded to this process's stderr, and cancelling ctx stops
// the ffmpeg process.
//
// An error is returned when the window is empty, inverted or NaN, when ffmpeg
// cannot be found on PATH, when the output directory cannot be created, or
// when ffmpeg exits unsuccessfully.
//
// NOTE(review): the re-encode path always emits H.264/AAC, so outPath should
// name a container that accepts those codecs; confirm callers never pass a
// WebM-style output path together with reencode=true.
func Extract(ctx context.Context, input string, sel Selection, outPath string, reencode bool) error {
	// Reject unusable windows before touching the filesystem or spawning
	// ffmpeg. Written as !(dur > 0) so a NaN duration is rejected as well.
	dur := sel.Duration()
	if !(dur > 0) {
		return fmt.Errorf("invalid clip window: end (%g) must be after start (%g)",
			sel.EndSeconds, sel.StartSeconds)
	}

	ffmpegPath, err := exec.LookPath("ffmpeg")
	if err != nil {
		return fmt.Errorf("ffmpeg not on PATH: %w", err)
	}
	if err := os.MkdirAll(filepath.Dir(outPath), 0o755); err != nil {
		return fmt.Errorf("creating output directory for %q: %w", outPath, err)
	}

	// -ss precedes -i so the seek is applied to the input; -t then limits
	// how much is written from that point.
	args := []string{
		"-y",
		"-loglevel", "error",
		"-ss", fmt.Sprintf("%.3f", sel.StartSeconds),
		"-i", input,
		"-t", fmt.Sprintf("%.3f", dur),
	}

	switch {
	case !reencode:
		args = append(args, "-c", "copy")
	case hasVideoExt(input):
		args = append(args,
			"-vf", portraitFilter,
			"-c:v", "libx264",
			"-preset", "fast",
			"-crf", "23",
			"-c:a", "aac",
			"-b:a", "128k",
			"-movflags", "+faststart",
		)
	default:
		// Audio-only source: drop any video stream and encode AAC.
		args = append(args,
			"-vn",
			"-c:a", "aac",
			"-b:a", "128k",
		)
	}
	args = append(args, "-fs", fmt.Sprintf("%d", MaxClipBytes), outPath)

	// Run the binary LookPath resolved instead of looking it up a second time.
	cmd := exec.CommandContext(ctx, ffmpegPath, args...)
	cmd.Stdout = os.Stderr
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("ffmpeg cut: %w", err)
	}
	return nil
}
// hasVideoExt reports whether p ends in one of the video file extensions this
// package recognises. The comparison is case-insensitive and looks only at
// the final extension of the path.
func hasVideoExt(p string) bool {
	ext := strings.ToLower(filepath.Ext(p))
	for _, known := range [...]string{".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v", ".flv", ".ts"} {
		if ext == known {
			return true
		}
	}
	return false
}
// DefaultOutputPath derives an output path that sits next to input. Inputs
// with a recognised video extension become <input-without-ext>.clip<ext>,
// keeping the original extension; every other input becomes
// <input-without-ext>.clip.m4a.
func DefaultOutputPath(input string) string {
	ext := filepath.Ext(input)
	stem := strings.TrimSuffix(input, ext)
	if !hasVideoExt(input) {
		return stem + ".clip.m4a"
	}
	return stem + ".clip" + ext
}