aboutsummaryrefslogtreecommitdiff
path: root/src/subtitle_extraction
diff options
context:
space:
mode:
authorMalte Voos <git@mal.tc>2025-11-14 15:30:49 +0100
committerMalte Voos <git@mal.tc>2025-11-14 15:30:49 +0100
commita8457a25ccb9b1ef47f5ce9d7ac1a84c47600c9e (patch)
tree542b42d3316138043272faba42e0d1005f8403b6 /src/subtitle_extraction
parenta42a73378b7c527a5e4600544b2d7a86d68c5aac (diff)
downloadlleap-a8457a25ccb9b1ef47f5ce9d7ac1a84c47600c9e.tar.gz
lleap-a8457a25ccb9b1ef47f5ce9d7ac1a84c47600c9e.zip
implement file/url open dialog
Diffstat (limited to 'src/subtitle_extraction')
-rw-r--r--src/subtitle_extraction/embedded.rs118
-rw-r--r--src/subtitle_extraction/mod.rs159
-rw-r--r--src/subtitle_extraction/whisper.rs75
3 files changed, 352 insertions, 0 deletions
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
new file mode 100644
index 0000000..5cdf813
--- /dev/null
+++ b/src/subtitle_extraction/embedded.rs
@@ -0,0 +1,118 @@
+use std::sync::mpsc;
+
+use anyhow::Context;
+
+use crate::subtitle_extraction::*;
+
+pub fn extract_embedded_subtitles(
+ // stream index to use when storing extracted subtitles, this index already
+ // has to be in TRACKS when this function is called!
+ stream_ix: StreamIndex,
+ context: ffmpeg::codec::Context,
+ time_base: ffmpeg::Rational,
+ packet_rx: mpsc::Receiver<ffmpeg::Packet>,
+ sender: ComponentSender<SubtitleExtractor>,
+) -> anyhow::Result<()> {
+ let mut decoder = context
+ .decoder()
+ .subtitle()
+ .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
+
+ while let Ok(packet) = packet_rx.recv() {
+ let mut subtitle = ffmpeg::Subtitle::new();
+ match decoder.decode(&packet, &mut subtitle) {
+ Ok(true) => {
+ if let Some(cue) = parse_subtitle(&subtitle, &packet, time_base) {
+ SUBTITLE_TRACKS
+ .write()
+ .get_mut(&stream_ix)
+ .unwrap()
+ .cues
+ .push(cue.clone());
+ sender
+ .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
+ .unwrap();
+ } else {
+ log::error!("error parsing subtitle at pts {:?}", packet.pts())
+ }
+ }
+ Ok(false) => {
+ log::debug!("got empty (?) subtitle, not sure if this should ever happen");
+ }
+ Err(e) => {
+ log::error!("error decoding subtitle: {:?}", e)
+ }
+ }
+ }
+
+ Ok(())
+}
+
+fn parse_subtitle(
+ subtitle: &ffmpeg::Subtitle,
+ packet: &ffmpeg::Packet,
+ time_base: Rational,
+) -> Option<SubtitleCue> {
+ let time_to_clock_time = |time: i64| {
+ let nseconds: i64 =
+ (time * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
+ gst::ClockTime::from_nseconds(nseconds as u64)
+ };
+
+ let text = subtitle
+ .rects()
+ .into_iter()
+ .map(|rect| match rect {
+ ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
+ ffmpeg::subtitle::Rect::Ass(ass) => {
+ extract_dialogue_text(ass.get()).unwrap_or(String::new())
+ }
+ _ => String::new(),
+ })
+ .collect::<Vec<String>>()
+ .join("\n— ");
+
+ let start = time_to_clock_time(packet.pts()?);
+ let end = time_to_clock_time(packet.pts()? + packet.duration());
+
+ Some(SubtitleCue { start, end, text })
+}
+
/// Extracts the plain dialogue text from an ffmpeg-decoded ASS event line,
/// stripping override tags (`{\...}`) and translating ASS escape sequences
/// (`\N` → newline, `\n`/`\h` → space).
fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
    // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
    // we need the 9th field (Text), so split on comma but only take first 9 splits
    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
    let text = dialogue_line.splitn(9, ',').last()?;

    // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
    let mut result = String::new();
    let mut in_tag = false;
    let mut char_iter = text.chars().peekable();

    while let Some(c) = char_iter.next() {
        if c == '{' && char_iter.peek() == Some(&'\\') {
            in_tag = true;
        } else if in_tag {
            // inside an override block: discard everything up to and
            // including the closing brace
            if c == '}' {
                in_tag = false;
            }
        } else if c == '\\' {
            // process line breaks and hard spaces
            match char_iter.peek() {
                Some(&'N') => {
                    char_iter.next();
                    result.push('\n');
                }
                Some(&'n') | Some(&'h') => {
                    char_iter.next();
                    result.push(' ');
                }
                _ => result.push(c),
            }
        } else {
            // plain text; note this now also preserves a literal '}' that is
            // not closing an override tag (previously it was dropped)
            result.push(c);
        }
    }

    Some(result)
}
diff --git a/src/subtitle_extraction/mod.rs b/src/subtitle_extraction/mod.rs
new file mode 100644
index 0000000..9e7fff4
--- /dev/null
+++ b/src/subtitle_extraction/mod.rs
@@ -0,0 +1,159 @@
+/// Extraction of embedded subtitles
+mod embedded;
+/// Synthesis of subtitles from audio using whisper.cpp
+mod whisper;
+
+use std::{collections::BTreeMap, sync::mpsc, thread};
+
+use ffmpeg::Rational;
+use relm4::{ComponentSender, Worker};
+
+use crate::tracks::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue, SubtitleTrack, TrackMetadata};
+
+pub struct SubtitleExtractor {}
+
/// Input messages accepted by the [`SubtitleExtractor`] worker.
#[derive(Debug)]
pub enum SubtitleExtractorMsg {
    /// Start subtitle extraction for the media located at `url`.
    ExtractFromUrl {
        url: String,
        // the index of the audio stream on which to run a whisper transcription
        whisper_stream_index: Option<usize>,
    },
}
+
/// Output messages emitted by the [`SubtitleExtractor`] worker.
#[derive(Debug)]
pub enum SubtitleExtractorOutput {
    /// A new cue was extracted for the subtitle track with this stream index.
    NewCue(StreamIndex, SubtitleCue),
    /// Extraction finished successfully for all streams.
    ExtractionComplete,
}
+
impl Worker for SubtitleExtractor {
    type Init = ();
    type Input = SubtitleExtractorMsg;
    type Output = SubtitleExtractorOutput;

    /// No initialization data is needed; the worker is stateless.
    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
        Self {}
    }

    /// Dispatches incoming messages. Extraction runs synchronously inside
    /// `update` (see `handle_extract_from_url`), so further messages queue up
    /// until the current extraction finishes.
    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
        match msg {
            SubtitleExtractorMsg::ExtractFromUrl {
                url,
                whisper_stream_index: whisper_audio_stream_ix,
            } => {
                self.handle_extract_from_url(url, whisper_audio_stream_ix, sender);
            }
        }
    }
}
+
+impl SubtitleExtractor {
+ fn handle_extract_from_url(
+ &mut self,
+ url: String,
+ whisper_audio_stream_ix: Option<usize>,
+ sender: ComponentSender<Self>,
+ ) {
+ // Clear existing tracks
+ SUBTITLE_TRACKS.write().clear();
+
+ match self.extract_subtitles(&url, whisper_audio_stream_ix, sender.clone()) {
+ Ok(_) => {
+ log::info!("Subtitle extraction completed successfully");
+ sender
+ .output(SubtitleExtractorOutput::ExtractionComplete)
+ .unwrap();
+ }
+ Err(e) => {
+ log::error!("Subtitle extraction failed: {}", e);
+ }
+ }
+ }
+
+ fn extract_subtitles(
+ &self,
+ url: &str,
+ whisper_audio_stream_ix: Option<usize>,
+ sender: ComponentSender<Self>,
+ ) -> anyhow::Result<()> {
+ let mut input = ffmpeg::format::input(&url)?;
+
+ let mut subtitle_extractors = BTreeMap::new();
+
+ // create extractor for each subtitle stream
+ for stream in input.streams() {
+ let stream_ix = stream.index();
+
+ if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
+ let metadata = TrackMetadata::from_ffmpeg_stream(&stream);
+ let track = SubtitleTrack {
+ metadata,
+ cues: Vec::new(),
+ };
+
+ SUBTITLE_TRACKS.write().insert(stream_ix, track);
+
+ let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
+ let (packet_tx, packet_rx) = mpsc::channel();
+ let time_base = stream.time_base();
+ let sender = sender.clone();
+ let join_handle = thread::spawn(move || {
+ embedded::extract_embedded_subtitles(
+ stream_ix, context, time_base, packet_rx, sender,
+ )
+ });
+
+ subtitle_extractors.insert(stream_ix, (packet_tx, join_handle));
+ }
+ }
+
+ if let Some(stream_ix) = whisper_audio_stream_ix {
+ let stream = input.stream(stream_ix).unwrap();
+
+ let mut metadata = TrackMetadata::from_ffmpeg_stream(&stream);
+ metadata.title = Some(match metadata.title {
+ Some(title) => format!("Auto-generated from audio (Whisper): {}", title),
+ None => "Auto-generated from audio (Whisper)".to_string(),
+ });
+
+ let track = SubtitleTrack {
+ metadata,
+ cues: Vec::new(),
+ };
+
+ SUBTITLE_TRACKS.write().insert(stream_ix, track);
+
+ let context = ffmpeg::codec::Context::from_parameters(stream.parameters())?;
+ let (packet_tx, packet_rx) = mpsc::channel();
+ let time_base = stream.time_base();
+ let sender = sender.clone();
+ let join_handle = thread::spawn(move || {
+ whisper::generate_whisper_subtitles(
+ stream_ix, context, time_base, packet_rx, sender,
+ )
+ });
+
+ subtitle_extractors.insert(stream_ix, (packet_tx, join_handle));
+ }
+
+ // process packets
+ for (stream, packet) in input.packets() {
+ let stream_index = stream.index();
+
+ if let Some((packet_tx, _)) = subtitle_extractors.get_mut(&stream_index) {
+ packet_tx.send(packet).unwrap();
+ }
+ }
+
+ // wait for extraction to complete
+ for (_, (_, join_handle)) in subtitle_extractors {
+ join_handle
+ .join()
+ .unwrap()
+ .unwrap_or_else(|e| log::error!("error running subtitle extraction: {}", e));
+ }
+
+ Ok(())
+ }
+}
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
new file mode 100644
index 0000000..5622d6f
--- /dev/null
+++ b/src/subtitle_extraction/whisper.rs
@@ -0,0 +1,75 @@
+use std::sync::mpsc;
+
+use anyhow::Context;
+use ffmpeg::filter;
+
+use crate::{subtitle_extraction::*, tracks::StreamIndex};
+
+pub fn generate_whisper_subtitles(
+ // stream index to use when storing generated subtitles, this index
+ // already has to be in TRACKS when this function is called!
+ stream_ix: StreamIndex,
+ context: ffmpeg::codec::Context,
+ time_base: ffmpeg::Rational,
+ packet_rx: mpsc::Receiver<ffmpeg::Packet>,
+ sender: ComponentSender<SubtitleExtractor>,
+) -> anyhow::Result<()> {
+ let mut decoder = context
+ .decoder()
+ .audio()
+ .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
+
+ let mut filter = filter::Graph::new();
+
+ let abuffer_args = format!(
+ "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
+ decoder.time_base(),
+ decoder.rate(),
+ decoder.format().name(),
+ decoder.channel_layout().bits()
+ );
+ let whisper_args = format!(
+ "model={}:queue={}:format=json",
+ "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin", 30
+ );
+ let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
+
+ filter.add(&filter::find("abuffer").unwrap(), "src", &abuffer_args)?;
+ filter.add(&filter::find("abuffersink").unwrap(), "sink", "")?;
+ filter
+ .output("src", 0)?
+ .input("sink", 0)?
+ .parse(&filter_spec)?;
+ filter.validate()?;
+
+ let mut source_ctx = filter.get("src").unwrap();
+ let mut sink_ctx = filter.get("sink").unwrap();
+
+ while let Ok(packet) = packet_rx.recv() {
+ handle_packet(&mut decoder, source_ctx.source(), sink_ctx.sink(), packet)
+ .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
+ }
+
+ Ok(())
+}
+
+fn handle_packet(
+ decoder: &mut ffmpeg::decoder::Audio,
+ mut source: filter::Source,
+ mut sink: filter::Sink,
+ packet: ffmpeg::Packet,
+) -> anyhow::Result<()> {
+ let mut in_frame = unsafe { ffmpeg::Frame::empty() };
+ decoder.send_packet(&packet)?;
+ decoder.receive_frame(&mut in_frame)?;
+ source.add(&in_frame)?;
+
+ let mut out_frame = unsafe { ffmpeg::Frame::empty() };
+ sink.frame(&mut out_frame)?;
+
+ if let Some(text) = out_frame.metadata().get("lavfi.whisper.text") {
+ println!("{}", text);
+ }
+
+ Ok(())
+}