// Package clip selects the best 60–90s window from a timestamped transcript
// (using a Summarizer to do the picking) and runs ffmpeg to cut that window
// out of the original media.
package clip
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"fmt"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"strings"
|
||
|
||
"publish/internal/summarize"
|
||
"publish/internal/transcribe"
|
||
)
|
||
|
||
// Selection describes the clip window chosen by the LLM together with the
// descriptive metadata it returned. Every field is decoded from the JSON key
// named in its struct tag.
type Selection struct {
	// StartSeconds and EndSeconds delimit the chosen window, in seconds.
	StartSeconds float64 `json:"start_seconds"`
	EndSeconds   float64 `json:"end_seconds"`

	// Text fields returned alongside the window.
	Title     string `json:"title"`
	Hook      string `json:"hook"`
	Quote     string `json:"quote"`
	Reasoning string `json:"reasoning"`
}

// Duration reports the length of the selected window in seconds, computed as
// EndSeconds minus StartSeconds.
func (s Selection) Duration() float64 {
	length := s.EndSeconds - s.StartSeconds
	return length
}
|
||
|
||
// Pick asks the summarizer to choose the best window in the given segments,
|
||
// using promptTemplate (which may contain {{MIN_SECONDS}} / {{MAX_SECONDS}}
|
||
// placeholders). It clamps and validates the returned window against minSec
|
||
// and maxSec.
|
||
func Pick(ctx context.Context, sum summarize.Summarizer, promptTemplate string, segs []transcribe.Segment, minSec, maxSec float64) (Selection, string, error) {
|
||
if len(segs) == 0 {
|
||
return Selection{}, "", fmt.Errorf("no transcript segments to choose from")
|
||
}
|
||
prompt := strings.NewReplacer(
|
||
"{{MIN_SECONDS}}", fmt.Sprintf("%g", minSec),
|
||
"{{MAX_SECONDS}}", fmt.Sprintf("%g", maxSec),
|
||
).Replace(promptTemplate)
|
||
|
||
body := transcribe.FormatForLLM(segs)
|
||
|
||
raw, err := sum.Summarize(ctx, prompt, body)
|
||
if err != nil {
|
||
return Selection{}, "", err
|
||
}
|
||
|
||
jsonText, err := extractJSONObject(raw)
|
||
if err != nil {
|
||
return Selection{}, raw, fmt.Errorf("could not find JSON object in model output: %w", err)
|
||
}
|
||
var sel Selection
|
||
if err := json.Unmarshal([]byte(jsonText), &sel); err != nil {
|
||
return Selection{}, raw, fmt.Errorf("parsing selection JSON: %w\n--- raw ---\n%s", err, jsonText)
|
||
}
|
||
|
||
if err := validate(&sel, segs, minSec, maxSec); err != nil {
|
||
return sel, raw, err
|
||
}
|
||
return sel, raw, nil
|
||
}
|
||
|
||
func validate(sel *Selection, segs []transcribe.Segment, minSec, maxSec float64) error {
|
||
if sel.EndSeconds <= sel.StartSeconds {
|
||
return fmt.Errorf("invalid window: end (%g) <= start (%g)", sel.EndSeconds, sel.StartSeconds)
|
||
}
|
||
maxEnd := segs[len(segs)-1].End
|
||
if sel.StartSeconds < 0 || sel.EndSeconds > maxEnd+1.0 {
|
||
return fmt.Errorf("window [%g, %g] is outside transcript bounds [0, %g]",
|
||
sel.StartSeconds, sel.EndSeconds, maxEnd)
|
||
}
|
||
dur := sel.Duration()
|
||
// Allow small slop on either side; otherwise reject.
|
||
if dur < minSec-2 || dur > maxSec+2 {
|
||
return fmt.Errorf("window duration %.1fs is outside requested bounds [%g, %g]",
|
||
dur, minSec, maxSec)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// extractJSONObject pulls a JSON object out of s, which is typically model
// output that wraps its answer in prose despite being told not to.
//
// Every '{' in s is considered, left to right, as a possible start of the
// object. For each one the matching '}' is located while ignoring braces that
// appear inside JSON strings. The first balanced candidate that is also valid
// JSON is returned, so a stray or unbalanced brace in the surrounding prose no
// longer hides a well-formed object that follows it.
//
// If no candidate is valid JSON, the first balanced {...} span is returned
// unchanged so the caller's JSON decoder can report the precise syntax error.
// An error is returned when s contains no '{' at all, or when no '{' has a
// matching '}'.
//
// Each candidate is rescanned from its own start, so the worst case is
// quadratic in len(s); this is intended for short model replies.
func extractJSONObject(s string) (string, error) {
	if !strings.Contains(s, "{") {
		return "", fmt.Errorf("no '{' in response")
	}

	var (
		fallback    string
		hasFallback bool
	)
	for from := 0; from < len(s); {
		rel := strings.IndexByte(s[from:], '{')
		if rel < 0 {
			break
		}
		start := from + rel
		if end, ok := balancedObjectEnd(s, start); ok {
			candidate := s[start:end]
			if json.Valid([]byte(candidate)) {
				return candidate, nil
			}
			if !hasFallback {
				fallback, hasFallback = candidate, true
			}
		}
		// Either unbalanced or not JSON: retry from the next '{'.
		from = start + 1
	}

	if hasFallback {
		return fallback, nil
	}
	return "", fmt.Errorf("unbalanced braces")
}

// balancedObjectEnd scans s from the '{' at index start and returns the index
// just past the '}' that closes it. Braces inside double-quoted strings are
// ignored, and a backslash inside a string escapes the following byte. ok is
// false when the end of s is reached before the braces balance.
func balancedObjectEnd(s string, start int) (end int, ok bool) {
	depth := 0
	inStr, esc := false, false
	for i := start; i < len(s); i++ {
		c := s[i]
		if inStr {
			switch {
			case esc:
				esc = false
			case c == '\\':
				esc = true
			case c == '"':
				inStr = false
			}
			continue
		}
		switch c {
		case '"':
			inStr = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i + 1, true
			}
		}
	}
	return 0, false
}
|
||
|
||
// portraitFilter is the ffmpeg -vf chain used for portrait output. It
// center-crops the source to the largest 9:16 rectangle that fits inside it
// (cropping only, no distortion) and then scales the result to 1080x1920.
// The min() expressions choose that rectangle: 16:9 sources lose their
// left/right edges, 9:16 sources are unchanged, and 4:3 / 1:1 sources crop
// the sides. setsar=1 forces square pixels. Commas inside min() are escaped
// because a bare comma separates filters in the chain.
const portraitFilter = `crop=min(iw\,ih*9/16):min(ih\,iw*16/9)` +
	`,scale=1080:1920` +
	`,setsar=1`
|
||
|
||
// MaxClipBytes is the hard ceiling on output size, handed to ffmpeg through
// its -fs flag. Realistic 60–90s 1080x1920 H.264 clips at CRF 23 land around
// 30–100 MB, so this acts as a safety cap rather than a target.
const MaxClipBytes = 1024 * 1024 * 1024 // 1 GiB
|
||
|
||
// Extract runs ffmpeg to cut [start, end) seconds out of input into outPath.
|
||
// For video inputs, the clip is re-encoded as a 1080x1920 portrait (9:16
|
||
// center-crop) under a 1 GiB size cap. If reencode is false, stream copy is
|
||
// used (fast, keyframe-aligned, but the source aspect ratio is preserved).
|
||
func Extract(ctx context.Context, input string, sel Selection, outPath string, reencode bool) error {
|
||
if _, err := exec.LookPath("ffmpeg"); err != nil {
|
||
return fmt.Errorf("ffmpeg not on PATH: %w", err)
|
||
}
|
||
if err := os.MkdirAll(filepath.Dir(outPath), 0o755); err != nil {
|
||
return err
|
||
}
|
||
|
||
dur := sel.EndSeconds - sel.StartSeconds
|
||
args := []string{
|
||
"-y",
|
||
"-loglevel", "error",
|
||
"-ss", fmt.Sprintf("%.3f", sel.StartSeconds),
|
||
"-i", input,
|
||
"-t", fmt.Sprintf("%.3f", dur),
|
||
}
|
||
if reencode {
|
||
if hasVideoExt(input) {
|
||
args = append(args,
|
||
"-vf", portraitFilter,
|
||
"-c:v", "libx264",
|
||
"-preset", "fast",
|
||
"-crf", "23",
|
||
"-c:a", "aac",
|
||
"-b:a", "128k",
|
||
"-movflags", "+faststart",
|
||
)
|
||
} else {
|
||
args = append(args,
|
||
"-vn",
|
||
"-c:a", "aac",
|
||
"-b:a", "128k",
|
||
)
|
||
}
|
||
} else {
|
||
args = append(args, "-c", "copy")
|
||
}
|
||
args = append(args, "-fs", fmt.Sprintf("%d", MaxClipBytes), outPath)
|
||
|
||
cmd := exec.CommandContext(ctx, "ffmpeg", args...)
|
||
cmd.Stdout = os.Stderr
|
||
cmd.Stderr = os.Stderr
|
||
if err := cmd.Run(); err != nil {
|
||
return fmt.Errorf("ffmpeg cut: %w", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// hasVideoExt reports whether p ends in one of the recognised video file
// extensions. The comparison is case-insensitive and looks only at the
// extension returned by filepath.Ext; the file itself is never opened.
func hasVideoExt(p string) bool {
	videoExts := [...]string{".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v", ".flv", ".ts"}

	ext := strings.ToLower(filepath.Ext(p))
	for _, known := range videoExts {
		if ext == known {
			return true
		}
	}
	return false
}
|
||
|
||
// DefaultOutputPath builds <input-without-ext>.clip<ext> for video inputs and
|
||
// .m4a for audio inputs.
|
||
func DefaultOutputPath(input string) string {
|
||
base := strings.TrimSuffix(input, filepath.Ext(input))
|
||
if hasVideoExt(input) {
|
||
return base + ".clip" + filepath.Ext(input)
|
||
}
|
||
return base + ".clip.m4a"
|
||
}
|