aboutsummaryrefslogtreecommitdiff
path: root/src/subtitle_extraction
diff options
context:
space:
mode:
authorMalte Voos <git@mal.tc>2025-12-05 15:35:38 +0100
committerMalte Voos <git@mal.tc>2025-12-05 15:43:58 +0100
commitc347b6133365dcf1b7da4e77890b20d04d6cfba4 (patch)
treec83aac6f7d1e6edc57e607f01e5d3eeee8da4a0e /src/subtitle_extraction
parent652b1c2a0ce7db4885ebc51f7f09133a43401442 (diff)
downloadlleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.tar.gz
lleap-c347b6133365dcf1b7da4e77890b20d04d6cfba4.zip
implement machine translation; various fixes and refactorings
Diffstat (limited to 'src/subtitle_extraction')
-rw-r--r--src/subtitle_extraction/embedded.rs118
-rw-r--r--src/subtitle_extraction/mod.rs159
-rw-r--r--src/subtitle_extraction/whisper.rs143
3 files changed, 0 insertions, 420 deletions
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
deleted file mode 100644
index 0ba6178..0000000
--- a/src/subtitle_extraction/embedded.rs
+++ /dev/null
@@ -1,118 +0,0 @@
-use std::sync::mpsc;
-
-use anyhow::Context;
-
-use crate::subtitle_extraction::*;
-
-/// Decodes the packets of one embedded subtitle stream into cues.
-///
-/// Runs until `packet_rx` is closed. Every cue that can be parsed is appended
-/// to the `SUBTITLE_TRACKS` entry for `stream_ix` and then announced through
-/// `sender` as [`SubtitleExtractorOutput::NewCue`]. Packets that fail to
-/// decode or parse are logged and skipped.
-///
-/// # Errors
-///
-/// Returns an error if no subtitle decoder can be created from `context`.
-pub fn extract_embedded_subtitles(
-    // stream index to use when storing extracted subtitles, this index already
-    // has to be in SUBTITLE_TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    let mut decoder = context
-        .decoder()
-        .subtitle()
-        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
-
-    // `recv` fails once every sender has been dropped, which ends the loop.
-    while let Ok(packet) = packet_rx.recv() {
-        let mut subtitle = ffmpeg::Subtitle::new();
-        match decoder.decode(&packet, &mut subtitle) {
-            Ok(true) => match parse_subtitle(&subtitle, &packet, time_base) {
-                Some(cue) => {
-                    // The write lock is only held for this statement, so it is
-                    // released again before the cue is sent out.
-                    match SUBTITLE_TRACKS.write().get_mut(&stream_ix) {
-                        Some(track) => track.cues.push(cue.clone()),
-                        None => {
-                            // The track disappeared; there is nowhere to
-                            // store the cue, but that is no reason to panic.
-                            log::error!(
-                                "no subtitle track registered for stream {}, dropping cue",
-                                stream_ix
-                            );
-                            continue;
-                        }
-                    }
-
-                    if sender
-                        .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
-                        .is_err()
-                    {
-                        // Nobody receives cues anymore, so further decoding
-                        // would be wasted work.
-                        log::warn!(
-                            "cue receiver is gone, stopping extraction for stream {}",
-                            stream_ix
-                        );
-                        return Ok(());
-                    }
-                }
-                None => log::error!("error parsing subtitle at pts {:?}", packet.pts()),
-            },
-            Ok(false) => {
-                log::debug!("got empty (?) subtitle, not sure if this should ever happen");
-            }
-            Err(e) => {
-                log::error!("error decoding subtitle: {:?}", e)
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Builds a [`SubtitleCue`] from a decoded subtitle and the packet it came from.
-///
-/// The cue text is the text of all rects joined with `"\n— "`; rects that are
-/// neither plain text nor ASS contribute an empty string. The cue spans from
-/// the packet's pts to pts + duration, converted from `time_base` units to
-/// nanoseconds.
-///
-/// Returns `None` if the packet has no pts or if `time_base` has a zero
-/// denominator.
-fn parse_subtitle(
-    subtitle: &ffmpeg::Subtitle,
-    packet: &ffmpeg::Packet,
-    time_base: Rational,
-) -> Option<SubtitleCue> {
-    let numerator = i128::from(time_base.numerator());
-    let denominator = i128::from(time_base.denominator());
-    if denominator == 0 {
-        // a degenerate time base cannot be converted to clock time
-        return None;
-    }
-
-    let pts_to_clock_time = |pts: i64| {
-        // The product does not fit into an i64 for long streams with a fine
-        // time base, so the intermediate value is computed in 128 bits.
-        let nseconds = i128::from(pts) * numerator * 1_000_000_000 / denominator;
-        // Negative timestamps are pinned to zero instead of wrapping around
-        // to a huge unsigned value; the upper bound keeps the cast lossless.
-        let nseconds = nseconds.clamp(0, i128::from(i64::MAX));
-        gst::ClockTime::from_nseconds(nseconds as u64)
-    };
-
-    let text = subtitle
-        .rects()
-        .into_iter()
-        .map(|rect| match rect {
-            ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
-            ffmpeg::subtitle::Rect::Ass(ass) => {
-                extract_dialogue_text(ass.get()).unwrap_or_default()
-            }
-            _ => String::new(),
-        })
-        .collect::<Vec<String>>()
-        .join("\n— ");
-
-    let pts = packet.pts()?;
-    let start = pts_to_clock_time(pts);
-    let end = pts_to_clock_time(pts.saturating_add(packet.duration()));
-
-    Some(SubtitleCue { start, end, text })
-}
-
-/// Extracts the plain text from one ASS dialogue line.
-///
-/// Override blocks such as `{\b1}` or `{\c&Hffffff&}` are removed, `\N`
-/// becomes a newline, and `\n` and `\h` become a space. Any other backslash is
-/// kept as is.
-///
-/// Returns `None` if the line has fewer than nine comma-separated fields, i.e.
-/// if it has no Text field at all.
-fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
-    // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
-    // The Text field may itself contain commas, so split into at most 9 pieces
-    // and take exactly the 9th. (`last()` would hand back an unrelated field
-    // for lines that are too short.)
-    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
-    let text = dialogue_line.splitn(9, ',').nth(8)?;
-
-    // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
-    let mut result = String::with_capacity(text.len());
-    let mut in_tag = false;
-    let mut char_iter = text.chars().peekable();
-
-    while let Some(c) = char_iter.next() {
-        if in_tag {
-            // everything up to and including the closing brace is dropped
-            if c == '}' {
-                in_tag = false;
-            }
-        } else if c == '{' && char_iter.peek() == Some(&'\\') {
-            in_tag = true;
-        } else if c == '\\' {
-            // process line breaks and hard spaces
-            match char_iter.peek() {
-                Some(&'N') => {
-                    char_iter.next();
-                    result.push('\n');
-                }
-                Some(&'n') | Some(&'h') => {
-                    char_iter.next();
-                    result.push(' ');
-                }
-                _ => result.push(c),
-            }
-        } else {
-            // includes a `}` that does not close an override block
-            result.push(c);
-        }
-    }
-
-    Some(result)
-}
diff --git a/src/subtitle_extraction/mod.rs b/src/subtitle_extraction/mod.rs
deleted file mode 100644
index 9e7fff4..0000000
--- a/src/subtitle_extraction/mod.rs
+++ /dev/null
@@ -1,159 +0,0 @@
-/// Extraction of embedded subtitles
-mod embedded;
-/// Synthesis of subtitles from audio using whisper.cpp
-mod whisper;
-
-use std::{collections::BTreeMap, sync::mpsc, thread};
-
-use ffmpeg::Rational;
-use relm4::{ComponentSender, Worker};
-
-use crate::tracks::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue, SubtitleTrack, TrackMetadata};
-
-/// Stateless worker that extracts subtitle cues from a media URL.
-///
-/// It handles [`SubtitleExtractorMsg`] inputs and reports its results as
-/// [`SubtitleExtractorOutput`] messages.
-pub struct SubtitleExtractor {}
-
-/// Input messages understood by [`SubtitleExtractor`].
-#[derive(Debug)]
-pub enum SubtitleExtractorMsg {
- /// Discard all previously extracted tracks and extract the subtitles of
- /// the media at `url`.
- ExtractFromUrl {
- url: String,
- // the index of the audio stream on which to run a whisper transcription
- // (`None` requests no transcription)
- whisper_stream_index: Option<usize>,
- },
-}
-
-/// Messages emitted by [`SubtitleExtractor`] while it is working.
-#[derive(Debug)]
-pub enum SubtitleExtractorOutput {
- /// A cue was appended to the track stored under the given stream index.
- NewCue(StreamIndex, SubtitleCue),
- /// Extraction returned without an error; not sent when extraction fails.
- ExtractionComplete,
-}
-
-impl Worker for SubtitleExtractor {
-    type Init = ();
-    type Input = SubtitleExtractorMsg;
-    type Output = SubtitleExtractorOutput;
-
-    /// Creates the worker; it carries no state, so there is nothing to set up.
-    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
-        SubtitleExtractor {}
-    }
-
-    /// Routes an incoming message to the method that handles it.
-    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
-        match msg {
-            SubtitleExtractorMsg::ExtractFromUrl {
-                url,
-                whisper_stream_index,
-            } => self.handle_extract_from_url(url, whisper_stream_index, sender),
-        }
-    }
-}
-
-impl SubtitleExtractor {
-    /// Handles [`SubtitleExtractorMsg::ExtractFromUrl`].
-    ///
-    /// Clears `SUBTITLE_TRACKS`, runs the extraction and, if it succeeded,
-    /// emits [`SubtitleExtractorOutput::ExtractionComplete`]. Failures are
-    /// logged; nothing is emitted for them.
-    fn handle_extract_from_url(
-        &mut self,
-        url: String,
-        whisper_audio_stream_ix: Option<usize>,
-        sender: ComponentSender<Self>,
-    ) {
-        // Clear existing tracks
-        SUBTITLE_TRACKS.write().clear();
-
-        match self.extract_subtitles(&url, whisper_audio_stream_ix, sender.clone()) {
-            Ok(()) => {
-                log::info!("Subtitle extraction completed successfully");
-                // A missing receiver is not worth taking the worker down for.
-                if sender
-                    .output(SubtitleExtractorOutput::ExtractionComplete)
-                    .is_err()
-                {
-                    log::warn!("nobody is listening for the extraction result anymore");
-                }
-            }
-            Err(e) => {
-                log::error!("Subtitle extraction failed: {}", e);
-            }
-        }
-    }
-
-    /// Demuxes `url` and feeds the packets of every relevant stream to a
-    /// dedicated extraction thread.
-    ///
-    /// One thread is started per subtitle stream. If `whisper_audio_stream_ix`
-    /// is set, one more thread transcribes that audio stream. A track is
-    /// registered in `SUBTITLE_TRACKS` under the stream's index before its
-    /// thread starts. The call returns once all packets have been read and all
-    /// threads have finished.
-    ///
-    /// # Errors
-    ///
-    /// Fails if the input cannot be opened, a codec context cannot be created,
-    /// or `whisper_audio_stream_ix` does not name an existing stream. Errors
-    /// inside the extraction threads are logged, not returned.
-    fn extract_subtitles(
-        &self,
-        url: &str,
-        whisper_audio_stream_ix: Option<usize>,
-        sender: ComponentSender<Self>,
-    ) -> anyhow::Result<()> {
-        let mut input = ffmpeg::format::input(&url)?;
-
-        // Packet channel per stream index, and the threads to wait for. They
-        // are kept apart so that all channels can be closed before joining.
-        let mut packet_senders = BTreeMap::new();
-        let mut workers = Vec::new();
-
-        // create extractor for each subtitle stream
-        for stream in input.streams() {
-            if stream.parameters().medium() != ffmpeg::media::Type::Subtitle {
-                continue;
-            }
-
-            let stream_ix = stream.index();
-            let track = SubtitleTrack {
-                metadata: TrackMetadata::from_ffmpeg_stream(&stream),
-                cues: Vec::new(),
-            };
-            SUBTITLE_TRACKS.write().insert(stream_ix, track);
-
-            let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
-            let (packet_tx, packet_rx) = mpsc::channel();
-            let time_base = stream.time_base();
-            let sender = sender.clone();
-            let join_handle = thread::spawn(move || {
-                embedded::extract_embedded_subtitles(
-                    stream_ix, context, time_base, packet_rx, sender,
-                )
-            });
-
-            packet_senders.insert(stream_ix, packet_tx);
-            workers.push((stream_ix, join_handle));
-        }
-
-        if let Some(stream_ix) = whisper_audio_stream_ix {
-            // The index arrives in a message, so it is reported instead of
-            // trusted.
-            let stream = input
-                .stream(stream_ix)
-                .ok_or_else(|| anyhow::anyhow!("audio stream {} does not exist", stream_ix))?;
-
-            let mut metadata = TrackMetadata::from_ffmpeg_stream(&stream);
-            metadata.title = Some(match metadata.title {
-                Some(title) => format!("Auto-generated from audio (Whisper): {}", title),
-                None => "Auto-generated from audio (Whisper)".to_string(),
-            });
-
-            let track = SubtitleTrack {
-                metadata,
-                cues: Vec::new(),
-            };
-            SUBTITLE_TRACKS.write().insert(stream_ix, track);
-
-            let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
-            let (packet_tx, packet_rx) = mpsc::channel();
-            let time_base = stream.time_base();
-            let sender = sender.clone();
-            let join_handle = thread::spawn(move || {
-                whisper::generate_whisper_subtitles(
-                    stream_ix, context, time_base, packet_rx, sender,
-                )
-            });
-
-            packet_senders.insert(stream_ix, packet_tx);
-            workers.push((stream_ix, join_handle));
-        }
-
-        // process packets
-        for (stream, packet) in input.packets() {
-            let stream_index = stream.index();
-
-            if let Some(packet_tx) = packet_senders.get(&stream_index) {
-                if packet_tx.send(packet).is_err() {
-                    // The thread has already returned (its own error is
-                    // reported when it is joined); stop feeding it.
-                    log::error!(
-                        "extractor for stream {} stopped early, dropping its packets",
-                        stream_index
-                    );
-                    packet_senders.remove(&stream_index);
-                }
-            }
-        }
-
-        // Close every channel first: the threads loop on `recv()` and only
-        // leave that loop once their sender is gone, so joining while a sender
-        // is still alive would block.
-        drop(packet_senders);
-
-        // wait for extraction to complete
-        for (stream_ix, join_handle) in workers {
-            match join_handle.join() {
-                Ok(Ok(())) => {}
-                Ok(Err(e)) => log::error!("error running subtitle extraction: {}", e),
-                Err(_) => log::error!("extraction thread for stream {} panicked", stream_ix),
-            }
-        }
-
-        Ok(())
-    }
-}
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
deleted file mode 100644
index ffa2e47..0000000
--- a/src/subtitle_extraction/whisper.rs
+++ /dev/null
@@ -1,143 +0,0 @@
-use std::{
- io::{self, BufRead, BufReader},
- net::{TcpListener, TcpStream},
- sync::mpsc,
-};
-
-use anyhow::Context;
-use ffmpeg::{filter, frame};
-use serde::Deserialize;
-
-use crate::{subtitle_extraction::*, tracks::StreamIndex};
-
-/// One transcript entry as deserialized from a line of JSON read off the
-/// transcript socket.
-#[derive(Debug, Deserialize)]
-struct WhisperCue {
- // start and end are converted with `ClockTime::from_mseconds`, i.e. they
- // are treated as milliseconds
- start: u64,
- end: u64,
- text: String,
-}
-
-/// Transcribes one audio stream into subtitle cues with FFmpeg's whisper filter.
-///
-/// Packets received on `packet_rx` are decoded and pushed through an
-/// `abuffer -> whisper -> abuffersink` filter graph. The filter reports its
-/// transcript as JSON lines over a local TCP connection. Each line becomes a
-/// cue that is stored in `SUBTITLE_TRACKS` under `stream_ix` and announced
-/// through `sender`. The function returns once `packet_rx` is closed.
-///
-/// The model file is taken from the `LLEAP_WHISPER_MODEL` environment variable
-/// if it is set; otherwise the previous built-in path is used.
-///
-/// # Errors
-///
-/// Fails if the listening socket, the audio decoder or the filter graph cannot
-/// be set up, or if the filter never connects. Errors for individual packets
-/// are logged and do not end the transcription.
-pub fn generate_whisper_subtitles(
-    // stream index to use when storing generated subtitles, this index
-    // already has to be in SUBTITLE_TRACKS when this function is called!
-    stream_ix: StreamIndex,
-    context: ffmpeg::codec::Context,
-    time_base: ffmpeg::Rational,
-    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
-    sender: ComponentSender<SubtitleExtractor>,
-) -> anyhow::Result<()> {
-    // FFmpeg's whisper filter will send the generated subtitles to us as JSON
-    // objects over a TCP socket. This is the best solution I could find
-    // because we need to use one of the protocols in
-    // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the
-    // list which is portable and supports non-blocking IO in Rust.
-    let tcp_listener = TcpListener::bind("127.0.0.1:0")?;
-
-    let mut decoder = context
-        .decoder()
-        .audio()
-        .with_context(|| format!("error creating audio decoder for stream {}", stream_ix))?;
-
-    let mut filter = filter::Graph::new();
-
-    let abuffer_args = format!(
-        "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
-        time_base,
-        decoder.rate(),
-        decoder.format().name(),
-        decoder.channel_layout().bits()
-    );
-
-    // The fallback is the path used so far, so existing setups keep working.
-    // NOTE(review): the path is inserted into the filter description
-    // unescaped, so it presumably must not contain `:` — confirm.
-    let model_path = std::env::var("LLEAP_WHISPER_MODEL").unwrap_or_else(|_| {
-        "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin".to_string()
-    });
-
-    let whisper_args = format!(
-        "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
-        model_path,
-        30,
-        tcp_listener.local_addr()?.port()
-    );
-    let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
-
-    let abuffer = filter::find("abuffer").context("FFmpeg filter `abuffer` is not available")?;
-    let abuffersink =
-        filter::find("abuffersink").context("FFmpeg filter `abuffersink` is not available")?;
-    filter.add(&abuffer, "src", &abuffer_args)?;
-    filter.add(&abuffersink, "sink", "")?;
-    filter
-        .output("src", 0)?
-        .input("sink", 0)?
-        .parse(&filter_spec)?;
-    filter.validate()?;
-
-    let mut source_ctx = filter
-        .get("src")
-        .context("filter graph has no `src` filter")?;
-    let mut sink_ctx = filter
-        .get("sink")
-        .context("filter graph has no `sink` filter")?;
-
-    let (tcp_stream, _) = tcp_listener.accept()?;
-    // Reading the transcript must never stall the audio pipeline.
-    tcp_stream.set_nonblocking(true)?;
-
-    let mut transcript_reader = BufReader::new(tcp_stream);
-    let mut line_buf = String::new();
-
-    while let Ok(packet) = packet_rx.recv() {
-        handle_packet(
-            stream_ix,
-            &sender,
-            &mut decoder,
-            source_ctx.source(),
-            sink_ctx.sink(),
-            &mut transcript_reader,
-            &mut line_buf,
-            packet,
-        )
-        .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
-    }
-
-    Ok(())
-}
-
-// TODO: can we do this without passing all the arguments? this is kinda ugly
-/// Runs one audio packet through the decoder and the filter graph, then
-/// collects every transcript line that has arrived on the socket.
-///
-/// `transcript_reader` is expected to wrap a non-blocking socket. `line_buf`
-/// must be the same buffer on every call: a line that has only partially
-/// arrived stays in it until the rest shows up on a later call.
-///
-/// # Errors
-///
-/// Fails if decoding or filtering fails, on socket errors other than
-/// `WouldBlock`, if a transcript line is not valid JSON, if the track for
-/// `stream_ix` is missing, or if the cue receiver is gone.
-fn handle_packet(
-    stream_ix: StreamIndex,
-    sender: &ComponentSender<SubtitleExtractor>,
-    decoder: &mut ffmpeg::decoder::Audio,
-    mut source: filter::Source,
-    mut sink: filter::Sink,
-    transcript_reader: &mut BufReader<TcpStream>,
-    line_buf: &mut String,
-    packet: ffmpeg::Packet,
-) -> anyhow::Result<()> {
-    decoder.send_packet(&packet)?;
-
-    let mut decoded = frame::Audio::empty();
-    while decoder.receive_frame(&mut decoded).is_ok() {
-        source.add(&decoded)?;
-    }
-
-    // The filtered audio itself is not needed; the sink is only drained.
-    let mut out_frame = frame::Audio::empty();
-    while sink.frame(&mut out_frame).is_ok() {}
-
-    loop {
-        let bytes_read = match transcript_reader.read_line(line_buf) {
-            Ok(n) => n,
-            // Nothing more to read right now. Whatever part of a line has
-            // already been received is kept in `line_buf` for the next call.
-            Err(e) if e.kind() == io::ErrorKind::WouldBlock => break,
-            Err(e) => return Err(e.into()),
-        };
-
-        // Parse before clearing, and clear before propagating a parse error,
-        // so that a bad line cannot corrupt the one after it.
-        let parsed = match line_buf.trim() {
-            "" => None,
-            line => Some(serde_json::from_str::<WhisperCue>(line)),
-        };
-        line_buf.clear();
-
-        if let Some(whisper_cue) = parsed.transpose()? {
-            let cue = SubtitleCue {
-                start: gst::ClockTime::from_mseconds(whisper_cue.start),
-                end: gst::ClockTime::from_mseconds(whisper_cue.end),
-                text: whisper_cue.text,
-            };
-
-            // TODO deduplicate this vs. the code in embedded.rs
-            SUBTITLE_TRACKS
-                .write()
-                .get_mut(&stream_ix)
-                .with_context(|| format!("no subtitle track registered for stream {}", stream_ix))?
-                .cues
-                .push(cue.clone());
-            sender
-                .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
-                .map_err(|_| anyhow::anyhow!("cue receiver is gone"))?;
-        }
-
-        // zero bytes means the other side closed the connection
-        if bytes_read == 0 {
-            break;
-        }
-    }
-
-    Ok(())
-}