// Package clip selects the best 60–90s window from a timestamped transcript
// (using a Summarizer to do the picking) and runs ffmpeg to cut that window
// out of the original media.
package clip

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"publish/internal/summarize"
	"publish/internal/transcribe"
)

// Selection is the LLM's chosen clip window plus metadata. It is decoded
// directly from the JSON object found in the model's output, so the struct
// tags below define the keys the model is expected to emit.
type Selection struct {
	// StartSeconds and EndSeconds bound the chosen window, in seconds.
	StartSeconds float64 `json:"start_seconds"`
	EndSeconds   float64 `json:"end_seconds"`
	// Title, Hook, Quote and Reasoning are text fields supplied by the model;
	// nothing in this file interprets them.
	Title     string `json:"title"`
	Hook      string `json:"hook"`
	Quote     string `json:"quote"`
	Reasoning string `json:"reasoning"`
}

// Duration returns the selected window length in seconds
// (EndSeconds - StartSeconds). The result is not checked for sign.
func (s Selection) Duration() float64 { return s.EndSeconds - s.StartSeconds }

// Pick asks the summarizer to choose the best window in the given segments,
// using promptTemplate (which may contain {{MIN_SECONDS}} / {{MAX_SECONDS}}
// placeholders; they are replaced with minSec and maxSec).
//
// The window returned by the model is validated, not adjusted: it must have
// end > start, lie within the transcript bounds, and have a duration within
// [minSec, maxSec], each with a small tolerance.
//
// Pick returns the parsed Selection, the raw model output, and an error. The
// raw output is empty if there were no segments or the summarizer call itself
// failed. When validation fails, the rejected Selection is still returned
// alongside the error; on every other failure the Selection is the zero value.
func Pick(ctx context.Context, sum summarize.Summarizer, promptTemplate string, segs []transcribe.Segment, minSec, maxSec float64) (Selection, string, error) { if len(segs) == 0 { return Selection{}, "", fmt.Errorf("no transcript segments to choose from") } prompt := strings.NewReplacer( "{{MIN_SECONDS}}", fmt.Sprintf("%g", minSec), "{{MAX_SECONDS}}", fmt.Sprintf("%g", maxSec), ).Replace(promptTemplate) body := transcribe.FormatForLLM(segs) raw, err := sum.Summarize(ctx, prompt, body) if err != nil { return Selection{}, "", err } jsonText, err := extractJSONObject(raw) if err != nil { return Selection{}, raw, fmt.Errorf("could not find JSON object in model output: %w", err) } var sel Selection if err := json.Unmarshal([]byte(jsonText), &sel); err != nil { return Selection{}, raw, fmt.Errorf("parsing selection JSON: %w\n--- raw ---\n%s", err, jsonText) } if err := validate(&sel, segs, minSec, maxSec); err != nil { return sel, raw, err } return sel, raw, nil } func validate(sel *Selection, segs []transcribe.Segment, minSec, maxSec float64) error { if sel.EndSeconds <= sel.StartSeconds { return fmt.Errorf("invalid window: end (%g) <= start (%g)", sel.EndSeconds, sel.StartSeconds) } maxEnd := segs[len(segs)-1].End if sel.StartSeconds < 0 || sel.EndSeconds > maxEnd+1.0 { return fmt.Errorf("window [%g, %g] is outside transcript bounds [0, %g]", sel.StartSeconds, sel.EndSeconds, maxEnd) } dur := sel.Duration() // Allow small slop on either side; otherwise reject. if dur < minSec-2 || dur > maxSec+2 { return fmt.Errorf("window duration %.1fs is outside requested bounds [%g, %g]", dur, minSec, maxSec) } return nil } // extractJSONObject pulls the first balanced {...} object out of s, ignoring // braces that appear inside JSON strings. Useful when the model wraps its // answer in prose despite being told not to. 
func extractJSONObject(s string) (string, error) {
	// Strategy: walk the balanced {...} spans in order and return the first
	// one that is valid JSON. Prose such as "use {curly} braces" in front of
	// the real answer is skipped this way. If no span parses, the first
	// balanced span is returned so the caller's decode error can show the
	// offending text. An unbalanced span ends the search: looking inside it
	// would only surface nested fragments of a truncated object.
	first := strings.IndexByte(s, '{')
	if first < 0 {
		return "", fmt.Errorf("no '{' in response")
	}

	fallback := ""
	for start := first; ; {
		end := balancedObjectEnd(s, start)
		if end < 0 {
			break
		}
		candidate := s[start:end]
		if json.Valid([]byte(candidate)) {
			return candidate, nil
		}
		if fallback == "" {
			fallback = candidate
		}
		next := strings.IndexByte(s[end:], '{')
		if next < 0 {
			break
		}
		start = end + next
	}

	if fallback != "" {
		return fallback, nil
	}
	return "", fmt.Errorf("unbalanced braces")
}

// balancedObjectEnd scans s from the '{' at index start and returns the index
// just past the matching '}', or -1 if the braces never balance. Braces inside
// double-quoted strings are ignored, and a backslash escapes the next byte
// while inside a string.
func balancedObjectEnd(s string, start int) int {
	depth := 0
	inStr, esc := false, false
	for i := start; i < len(s); i++ {
		c := s[i]
		if inStr {
			switch {
			case esc:
				esc = false
			case c == '\\':
				esc = true
			case c == '"':
				inStr = false
			}
			continue
		}
		switch c {
		case '"':
			inStr = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i + 1
			}
		}
	}
	return -1
}

// portraitFilter center-crops any source aspect ratio to a 9:16 sub-rectangle
// (no distortion, just cropping) and scales to 1080x1920. The min() expressions
// pick the largest 9:16 box that fits inside the source: 16:9 sources lose the
// left/right edges, 9:16 sources are unchanged, and 4:3 / 1:1 sources crop the
// sides. setsar=1 forces square pixels.
const portraitFilter = `crop=min(iw\,ih*9/16):min(ih\,iw*16/9),scale=1080:1920,setsar=1`

// MaxClipBytes is the hard size ceiling enforced by ffmpeg's -fs flag.
// Realistic 60–90s 1080x1920 H.264 clips at CRF 23 land 30–100 MB, so this is
// a safety cap rather than a target.
const MaxClipBytes = 1 << 30 // 1 GiB

// Extract runs ffmpeg to cut [start, end) seconds out of input into outPath,
// creating outPath's directory if needed. With reencode set, video inputs
// (judged by file extension) are re-encoded as a 1080x1920 portrait (9:16
// center-crop) with H.264 video and AAC audio, and all other inputs are
// re-encoded to AAC audio only. If reencode is false, stream copy is used
// (fast, keyframe-aligned, but the source aspect ratio is preserved). The
// MaxClipBytes size cap is passed to ffmpeg in every mode.
func Extract(ctx context.Context, input string, sel Selection, outPath string, reencode bool) error { if _, err := exec.LookPath("ffmpeg"); err != nil { return fmt.Errorf("ffmpeg not on PATH: %w", err) } if err := os.MkdirAll(filepath.Dir(outPath), 0o755); err != nil { return err } dur := sel.EndSeconds - sel.StartSeconds args := []string{ "-y", "-loglevel", "error", "-ss", fmt.Sprintf("%.3f", sel.StartSeconds), "-i", input, "-t", fmt.Sprintf("%.3f", dur), } if reencode { if hasVideoExt(input) { args = append(args, "-vf", portraitFilter, "-c:v", "libx264", "-preset", "fast", "-crf", "23", "-c:a", "aac", "-b:a", "128k", "-movflags", "+faststart", ) } else { args = append(args, "-vn", "-c:a", "aac", "-b:a", "128k", ) } } else { args = append(args, "-c", "copy") } args = append(args, "-fs", fmt.Sprintf("%d", MaxClipBytes), outPath) cmd := exec.CommandContext(ctx, "ffmpeg", args...) cmd.Stdout = os.Stderr cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { return fmt.Errorf("ffmpeg cut: %w", err) } return nil } func hasVideoExt(p string) bool { switch strings.ToLower(filepath.Ext(p)) { case ".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v", ".flv", ".ts": return true } return false } // DefaultOutputPath builds .clip for video inputs and // .m4a for audio inputs. func DefaultOutputPath(input string) string { base := strings.TrimSuffix(input, filepath.Ext(input)) if hasVideoExt(input) { return base + ".clip" + filepath.Ext(input) } return base + ".clip.m4a" }