author    Malte Voos <git@mal.tc>    2025-11-14 15:30:49 +0100
committer Malte Voos <git@mal.tc>    2025-11-14 15:30:49 +0100
commit    a8457a25ccb9b1ef47f5ce9d7ac1a84c47600c9e (patch)
tree      542b42d3316138043272faba42e0d1005f8403b6 /src/subtitle_extractor_aishit.rs
parent    a42a73378b7c527a5e4600544b2d7a86d68c5aac (diff)
implement file/url open dialog
Diffstat (limited to 'src/subtitle_extractor_aishit.rs')
-rw-r--r--  src/subtitle_extractor_aishit.rs  |  732
1 file changed, 732 insertions, 0 deletions
diff --git a/src/subtitle_extractor_aishit.rs b/src/subtitle_extractor_aishit.rs
new file mode 100644
index 0000000..c615f6c
--- /dev/null
+++ b/src/subtitle_extractor_aishit.rs
@@ -0,0 +1,732 @@
+use std::collections::BTreeMap;
+
+use anyhow::Result;
+
+use ffmpeg::Rational;
+use log::{debug, error, info, warn};
+use relm4::{ComponentSender, SharedState, Worker};
+
+pub type StreamIndex = usize;
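+
+/// Stream index used for the subtitle track generated from audio via whisper.
+/// Deliberately far above any real container stream index to avoid collisions.
+pub const WHISPER_STREAM_INDEX: StreamIndex = 1000;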
+
+#[derive(Debug, Clone)]
+pub struct SubtitleCue {
+    pub start: gst::ClockTime,
+    pub end: gst::ClockTime,
+    pub text: String,
+}
+
+#[derive(Debug, Clone)]
+pub struct SubtitleTrack {
+    pub language: Option<isolang::Language>,
+    pub title: Option<String>,
+    pub cues: Vec<SubtitleCue>,
+    pub is_generated: bool, // true if generated from audio
+}
+
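+/// Shared map of all known subtitle tracks, keyed by stream index.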
+pub static TRACKS: SharedState<BTreeMap<StreamIndex, SubtitleTrack>> = SharedState::new();
+
+pub struct SubtitleExtractor {}
+
+#[derive(Debug)]
+pub enum SubtitleExtractorMsg {
+    ExtractFromUrl(String),
+}
+
+#[derive(Debug)]
+pub enum SubtitleExtractorOutput {
+    NewOrUpdatedTrackMetadata(StreamIndex),
+    NewCue(StreamIndex, SubtitleCue),
+    ExtractionComplete,
+}
+
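+// Relm4 worker: runs on its own thread; `update` is called for each incoming message.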
+impl Worker for SubtitleExtractor {
+    type Init = ();
+    type Input = SubtitleExtractorMsg;
+    type Output = SubtitleExtractorOutput;
+
+    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
+        Self {}
+    }
+
+    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
+        match msg {
+            SubtitleExtractorMsg::ExtractFromUrl(url) => {
+                self.handle_extract_from_url(url, sender);
+            }
+        }
+    }
+}
+
+impl SubtitleExtractor {
+    fn handle_extract_from_url(&mut self, url: String, sender: ComponentSender<Self>) {
+        // Clear existing tracks
+        TRACKS.write().clear();
+
+        // Try to extract subtitles using ffmpeg
+        match self.extract_subtitles_ffmpeg(&url, &sender) {
+            Ok(_) => {
+                info!("Subtitle extraction completed successfully");
+                sender
+                    .output(SubtitleExtractorOutput::ExtractionComplete)
+                    .unwrap();
+            }
+            Err(e) => {
+                error!("FFmpeg extraction failed: {}", e);
+            }
+        }
+    }
+
+    fn extract_subtitles_ffmpeg(&self, url: &str, sender: &ComponentSender<Self>) -> Result<()> {
+        info!("Starting subtitle extraction from: {}", url);
+        let mut input = ffmpeg::format::input(&url)?;
+
+        // Log input format info
+        info!(
+            "Input format: {} ({} streams)",
+            input.format().name(),
+            input.streams().count()
+        );
+
+        // Check if whisper filter is available
+        if let Some(whisper_filter) = ffmpeg::filter::find("whisper") {
+            info!("Whisper filter found: {}", whisper_filter.name());
+        } else {
+            warn!("Whisper filter not found - audio transcription will be skipped");
+        }
+
+        let mut subtitle_decoders = BTreeMap::new();
+        let mut audio_decoder: Option<ffmpeg::decoder::Audio> = None;
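+        // The graph must outlive the source/sink contexts borrowed from it, so it
+        // is bound here even though the binding itself is otherwise unused.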
+        let mut _whisper_filter_graph: Option<ffmpeg::filter::Graph> = None;
+        let mut whisper_source: Option<ffmpeg::filter::Context> = None;
+        let mut whisper_sink: Option<ffmpeg::filter::Context> = None;
+        let mut best_audio_stream_index: Option<usize> = None;
+
+        // Find best audio stream for whisper processing
+        if let Some(audio_stream) = input.streams().best(ffmpeg::media::Type::Audio) {
+            best_audio_stream_index = Some(audio_stream.index());
+
+            // Get audio parameters safely
+            let codec_id = audio_stream.parameters().id();
+            let (channels, sample_rate) =
+                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
+                    .ok()
+                    .and_then(|context| context.decoder().audio().ok())
+                    .map(|audio| (audio.channels(), audio.rate()))
+                    .unwrap_or((0, 0));
+
+            info!(
+                "Found best audio stream: index {} (codec: {:?}, channels: {}, sample_rate: {})",
+                audio_stream.index(),
+                codec_id,
+                channels,
+                sample_rate
+            );
+        } else {
+            info!("No audio stream found for whisper processing");
+        }
+
+        // Set up whisper filter graph if we found an audio stream
+        if let Some(audio_index) = best_audio_stream_index {
+            info!("Setting up whisper filter for audio stream {}", audio_index);
+
+            let audio_stream = input.stream(audio_index).unwrap();
+            if let Ok(context) =
+                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
+            {
+                if let Ok(decoder) = context.decoder().audio() {
+                    // Get decoder properties before moving it
+                    let decoder_rate = decoder.rate();
+                    let decoder_format = decoder.format();
+                    let decoder_channel_layout = decoder.channel_layout().bits();
+
+                    audio_decoder = Some(decoder);
+
+                    // Set up whisper filter graph
+                    debug!("Creating whisper filter graph...");
+                    debug!(
+                        "Audio stream time_base: {}, decoder rate: {}, format: {:?}, channel_layout: 0x{:x}",
+                        audio_stream.time_base(),
+                        decoder_rate,
+                        decoder_format,
+                        decoder_channel_layout
+                    );
+                    match self.setup_whisper_filter(&audio_stream) {
+                        Ok((graph, source, sink)) => {
+                            info!("Whisper filter graph created successfully");
+                            _whisper_filter_graph = Some(graph);
+                            whisper_source = Some(source);
+                            whisper_sink = Some(sink);
+                            debug!("Whisper source and sink contexts stored");
+
+                            // Create a generated subtitle track
+                            let track = SubtitleTrack {
+                                language: isolang::Language::from_639_1("en"),
+                                title: Some("Generated from Audio (Whisper)".to_string()),
+                                cues: Vec::new(),
+                                is_generated: true,
+                            };
+
+                            TRACKS.write().insert(WHISPER_STREAM_INDEX, track);
+
+                            sender
+                                .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
+                                    WHISPER_STREAM_INDEX,
+                                ))
+                                .unwrap();
+                        }
+                        Err(e) => {
+                            error!("Failed to setup whisper filter: {}", e);
+                            debug!("Whisper filter error details: {:?}", e);
+                            warn!(
+                                "Audio transcription will be skipped due to filter setup failure"
+                            );
+                        }
+                    }
+                }
+            }
+        }
+
+        // Create decoder for each subtitle stream
+        for (stream_index, stream) in input.streams().enumerate() {
+            if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
+                let language_code = stream.metadata().get("language").map(|s| s.to_string());
+                let title = stream.metadata().get("title").map(|s| s.to_string());
+
+                let track = SubtitleTrack {
+                    language: language_code.and_then(|code| isolang::Language::from_639_2b(&code)),
+                    title,
+                    cues: Vec::new(),
+                    is_generated: false,
+                };
+
+                TRACKS.write().insert(stream_index, track);
+
+                sender
+                    .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
+                        stream_index,
+                    ))
+                    .unwrap();
+
+                let context =
+                    ffmpeg::codec::context::Context::from_parameters(stream.parameters())?;
+                if let Ok(decoder) = context.decoder().subtitle() {
+                    subtitle_decoders.insert(stream_index, decoder);
+                    debug!("Created decoder for subtitle stream {}", stream_index);
+                } else {
+                    error!(
+                        "Failed to create decoder for subtitle stream {}",
+                        stream_index
+                    );
+                }
+            } else {
+                debug!("Skipping non-subtitle stream {}", stream_index);
+            }
+        }
+
+        // Process packets
+        for (stream, packet) in input.packets() {
+            let stream_index = stream.index();
+
+            // Process subtitle packets
+            if let Some(decoder) = subtitle_decoders.get_mut(&stream_index) {
+                let mut subtitle = ffmpeg::Subtitle::new();
+                if decoder.decode(&packet, &mut subtitle).is_ok() {
+                    if let Some(cue) = Self::subtitle_to_cue(&subtitle, &packet, stream.time_base())
+                    {
+                        if let Some(track) = TRACKS.write().get_mut(&stream_index) {
+                            track.cues.push(cue.clone());
+                        }
+
+                        sender
+                            .output(SubtitleExtractorOutput::NewCue(stream_index, cue))
+                            .unwrap();
+                    }
+                }
+            }
+
+            // Process audio packets for whisper
+            if Some(stream_index) == best_audio_stream_index {
+                debug!(
+                    "Processing audio packet for whisper (stream: {}, pts: {:?}, duration: {:?})",
+                    stream_index,
+                    packet.pts(),
+                    packet.duration()
+                );
+                debug!(
+                    "Audio decoder available: {}, Whisper source available: {}",
+                    audio_decoder.is_some(),
+                    whisper_source.is_some()
+                );
+                if let (Some(decoder), Some(source)) = (&mut audio_decoder, &mut whisper_source) {
+                    debug!("Both audio decoder and whisper source are available, processing...");
+                    // Send packet to audio decoder
+                    if let Err(e) = decoder.send_packet(&packet) {
+                        debug!("Failed to send packet to audio decoder: {}", e);
+                    }
+
+                    // Get decoded frames and send to whisper filter
+                    let mut frame = unsafe { ffmpeg::Frame::empty() };
+                    let mut frame_count = 0;
+                    while decoder.receive_frame(&mut frame).is_ok() {
+                        frame_count += 1;
+                        debug!(
+                            "Decoded audio frame {} (pts: {:?})",
+                            frame_count,
+                            frame.pts()
+                        );
+
+                        // Add frame to whisper filter
+                        if let Err(e) = source.source().add(&frame) {
+                            error!("Failed to add frame to whisper filter: {}", e);
+                        } else {
+                            debug!("Successfully added frame to whisper filter");
+                        }
+
+                        // Check for whisper output after adding each frame
+                        if let Some(sink) = &mut whisper_sink {
+                            self.check_whisper_output(sink, sender)?;
+                        }
+                    }
+                    if frame_count > 0 {
+                        debug!("Processed {} audio frames for whisper", frame_count);
+                    }
+                } else {
+                    debug!("Skipping audio packet - decoder or whisper source not available");
+                }
+            }
+        }
+
+        // Flush audio decoder and whisper filter
+        if let (Some(decoder), Some(source), Some(sink)) =
+            (&mut audio_decoder, &mut whisper_source, &mut whisper_sink)
+        {
+            info!("Flushing audio decoder and whisper filter...");
+            // Flush decoder
+            if let Err(e) = decoder.send_eof() {
+                debug!("Failed to send EOF to decoder: {}", e);
+            }
+            let mut frame = unsafe { ffmpeg::Frame::empty() };
+            let mut final_frame_count = 0;
+            while decoder.receive_frame(&mut frame).is_ok() {
+                final_frame_count += 1;
+                source.source().add(&frame).ok();
+            }
+            debug!("Flushed {} final frames from decoder", final_frame_count);
+
+            // Flush filter and get results
+            debug!("Flushing whisper filter...");
+            if let Err(e) = source.source().flush() {
+                error!("Failed to flush whisper filter: {}", e);
+            }
+
+            info!("Processing final whisper filter output...");
+            self.check_whisper_output(sink, sender)?;
+        }
+
+        Ok(())
+    }
+
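+    /// Builds an `abuffer -> whisper -> abuffersink` filter graph for the given
+    /// audio stream and returns the graph together with its source and sink contexts.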
+    fn setup_whisper_filter(
+        &self,
+        audio_stream: &ffmpeg::Stream,
+    ) -> Result<(
+        ffmpeg::filter::Graph,
+        ffmpeg::filter::Context,
+        ffmpeg::filter::Context,
+    )> {
+        debug!("Setting up whisper filter graph...");
+        let mut filter_graph = ffmpeg::filter::Graph::new();
+        debug!("Filter graph created successfully");
+
+        // Get audio parameters
+        debug!("Getting audio parameters...");
+        let time_base = audio_stream.time_base();
+        let audio_params = audio_stream.parameters();
+        debug!("Creating context from parameters...");
+        let context = ffmpeg::codec::context::Context::from_parameters(audio_params)?;
+        debug!("Getting audio decoder from context...");
+        let audio_decoder = context.decoder().audio()?;
+        debug!("Audio decoder created successfully");
+
+        // Create buffer source
+        let buffer_args = format!(
+            "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
+            time_base,
+            audio_decoder.rate(),
+            audio_decoder.format().name(),
+            audio_decoder.channel_layout().bits()
+        );
+        debug!("Buffer args: {}", buffer_args);
+
+        debug!("Looking for abuffer filter...");
+        let abuffer_filter = ffmpeg::filter::find("abuffer")
+            .ok_or_else(|| anyhow::anyhow!("abuffer filter not found"))?;
+        debug!("abuffer filter found: {}", abuffer_filter.name());
+
+        debug!("Adding abuffer filter...");
+        match filter_graph.add(&abuffer_filter, "src", &buffer_args) {
+            Ok(_) => debug!("abuffer filter added successfully"),
+            Err(e) => {
+                error!("Failed to add abuffer filter: {}", e);
+                return Err(anyhow::anyhow!("Failed to add abuffer filter: {}", e));
+            }
+        }
+
+        // Resolve the whisper model path to an absolute path so the filter can
+        // find it regardless of the working directory.
+        let model_path = std::path::Path::new("./whisper-models/ggml-large-v3.bin");
+        let absolute_path = if model_path.exists() {
+            model_path
+                .canonicalize()
+                .map(|p| p.to_string_lossy().to_string())
+                .unwrap_or_else(|_| "./whisper-models/ggml-large-v3.bin".to_string())
+        } else {
+            warn!("Whisper model file not found at: {:?}", model_path);
+            "./whisper-models/ggml-large-v3.bin".to_string()
+        };
+
+        debug!("Model path exists: {}", model_path.exists());
+        debug!("Using absolute path: {}", absolute_path);
+
+        debug!("Looking for whisper filter...");
+        let whisper_filter = ffmpeg::filter::find("whisper").ok_or_else(|| {
+            error!("Whisper filter not found! Make sure FFmpeg was compiled with whisper support");
+            anyhow::anyhow!("Whisper filter not available")
+        })?;
+
+        debug!("Whisper filter found: {}", whisper_filter.name());
+        // We'll create the whisper filter through the parse method instead of adding it manually
+
+        // Create audio buffer sink for whisper output (whisper outputs audio + metadata)
+        debug!("Looking for abuffersink filter for audio output...");
+        let abuffersink_filter = ffmpeg::filter::find("abuffersink")
+            .ok_or_else(|| anyhow::anyhow!("abuffersink filter not found"))?;
+        debug!("abuffersink filter found: {}", abuffersink_filter.name());
+
+        debug!("Adding abuffersink filter...");
+        match filter_graph.add(&abuffersink_filter, "sink", "") {
+            Ok(_) => debug!("abuffersink filter added successfully"),
+            Err(e) => {
+                error!("Failed to add abuffersink filter: {}", e);
+                return Err(anyhow::anyhow!("Failed to add abuffersink filter: {}", e));
+            }
+        }
+
+        // Connect filters using the complete filter chain description
+        debug!("Connecting filter graph with complete chain: src -> whisper -> sink");
+
+        let filter_chain = format!(
+            "[src]whisper=model={}:queue=30:format=json[sink]",
+            absolute_path,
+            // alternative model: "/Users/malte/repos/lleap/whisper-models/ggml-silero-v5.1.2.bin"
+        );
+        debug!("Using filter chain: {}", filter_chain);
+
+        if let Err(e) = filter_graph
+            .output("src", 0)
+            .and_then(|o| o.input("sink", 0))
+            .and_then(|i| i.parse(&filter_chain))
+        {
+            error!("Failed to connect filter graph: {}", e);
+            return Err(anyhow::anyhow!("Failed to connect filter graph: {}", e));
+        }
+        debug!("Filter graph connected successfully");
+
+        // Validate filter graph
+        debug!("Validating filter graph...");
+        match filter_graph.validate() {
+            Ok(_) => {
+                info!("Filter graph validated successfully");
+                debug!("Filter graph dump:\n{}", filter_graph.dump());
+            }
+            Err(e) => {
+                error!("Filter graph validation failed: {}", e);
+                debug!(
+                    "Filter graph dump before validation failure:\n{}",
+                    filter_graph.dump()
+                );
+                return Err(anyhow::anyhow!("Filter graph validation failed: {}", e));
+            }
+        }
+
+        debug!("Getting final source and sink contexts...");
+        let source_ctx = filter_graph
+            .get("src")
+            .ok_or_else(|| anyhow::anyhow!("Source context not found"))?;
+        let sink_ctx = filter_graph
+            .get("sink")
+            .ok_or_else(|| anyhow::anyhow!("Sink context not found"))?;
+        debug!("Final contexts retrieved successfully");
+
+        Ok((filter_graph, source_ctx, sink_ctx))
+    }
+
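+    /// Drains all frames currently queued at the filter sink and emits a cue for
+    /// each frame that carries `lavfi.whisper.text` metadata.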
+    fn check_whisper_output(
+        &self,
+        sink: &mut ffmpeg::filter::Context,
+        sender: &ComponentSender<Self>,
+    ) -> Result<()> {
+        debug!("Attempting to read audio frames from whisper filter output...");
+
+        // The whisper filter outputs audio frames with subtitle data in "lavfi.whisper.text" metadata
+        let mut frame = unsafe { ffmpeg::Frame::empty() };
+        let mut output_count = 0;
+
+        while sink.sink().frame(&mut frame).is_ok() {
+            output_count += 1;
+            debug!(
+                "Received audio frame {} from whisper filter (pts: {:?})",
+                output_count,
+                frame.pts()
+            );
+
+            // Look specifically for lavfi.whisper.text metadata
+            if let Some(whisper_text) = frame.metadata().get("lavfi.whisper.text") {
+                info!("Found whisper transcription: {}", whisper_text);
+
+                let start_time = if let Some(pts) = frame.pts() {
+                    // Convert PTS to nanoseconds based on whisper filter's time base (16kHz)
+                    gst::ClockTime::from_nseconds((pts as u64 * 1_000_000_000) / 16000)
+                } else {
+                    gst::ClockTime::ZERO
+                };
+
+                // Log all available metadata keys to help debug
+                let metadata_entries: Vec<(String, String)> = frame
+                    .metadata()
+                    .iter()
+                    .map(|(k, v)| (k.to_string(), v.to_string()))
+                    .collect();
+                if !metadata_entries.is_empty() {
+                    let metadata_keys: Vec<String> =
+                        metadata_entries.iter().map(|(k, _)| k.clone()).collect();
+                    debug!("Frame metadata keys: {:?}", metadata_keys);
+                }
+
+                // Parse the whisper text (might be JSON format)
+                self.parse_whisper_text(whisper_text, start_time, sender)?;
+            }
+        }
+
+        if output_count > 0 {
+            info!("Processed {} frames from whisper filter", output_count);
+        } else {
+            debug!("No frames available from whisper filter");
+        }
+
+        Ok(())
+    }
+
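+    /// Turns one whisper transcription string into a cue on the generated track.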
+    fn parse_whisper_text(
+        &self,
+        whisper_text: &str,
+        base_time: gst::ClockTime,
+        sender: &ComponentSender<Self>,
+    ) -> Result<()> {
+        debug!("Parsing whisper text: {}", whisper_text);
+
+        // The whisper text might be in different formats depending on the filter configuration
+        // For now, treat it as plain text and create a single cue
+        let cue = SubtitleCue {
+            start: base_time,
+            end: base_time + gst::ClockTime::from_seconds(3), // Default 3 second duration
+            text: whisper_text.to_string(),
+        };
+
+        if let Some(track) = TRACKS.write().get_mut(&WHISPER_STREAM_INDEX) {
+            track.cues.push(cue.clone());
+        }
+
+        sender
+            .output(SubtitleExtractorOutput::NewCue(WHISPER_STREAM_INDEX, cue))
+            .unwrap();
+
+        Ok(())
+    }
+
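+    /// Parses SRT-formatted whisper output into cues. Currently unused, since the
+    /// filter is configured with `format=json`; kept for a possible `format=srt` mode.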
+    fn parse_whisper_subtitle_data(
+        &self,
+        subtitle_data: &str,
+        sender: &ComponentSender<Self>,
+    ) -> Result<()> {
+        // Parse SRT-format output from whisper
+        info!(
+            "Parsing whisper subtitle data ({} characters)",
+            subtitle_data.len()
+        );
+        debug!("Subtitle data content:\n{}", subtitle_data);
+        let lines: Vec<&str> = subtitle_data.lines().collect();
+        let mut i = 0;
+
+        while i < lines.len() {
+            // Skip subtitle number
+            if lines[i].trim().parse::<i32>().is_ok() {
+                i += 1;
+            }
+
+            // Parse timestamp line
+            if i < lines.len() {
+                if let Some((start, end)) = self.parse_srt_timestamp(lines[i]) {
+                    i += 1;
+
+                    // Collect text lines
+                    let mut text_lines = Vec::new();
+                    while i < lines.len() && !lines[i].trim().is_empty() {
+                        text_lines.push(lines[i].to_string());
+                        i += 1;
+                    }
+
+                    if !text_lines.is_empty() {
+                        let cue = SubtitleCue {
+                            start,
+                            end,
+                            text: text_lines.join("\n"),
+                        };
+
+                        if let Some(track) = TRACKS.write().get_mut(&WHISPER_STREAM_INDEX) {
+                            track.cues.push(cue.clone());
+                        }
+
+                        sender
+                            .output(SubtitleExtractorOutput::NewCue(WHISPER_STREAM_INDEX, cue))
+                            .unwrap();
+                    }
+                }
+            }
+            i += 1;
+        }
+
+        Ok(())
+    }
+
+    fn parse_srt_timestamp(&self, line: &str) -> Option<(gst::ClockTime, gst::ClockTime)> {
+        // Parse SRT timestamp format: "00:00:01,234 --> 00:00:05,678"
+        let parts: Vec<&str> = line.split(" --> ").collect();
+        if parts.len() != 2 {
+            return None;
+        }
+
+        let start = self.parse_srt_time(parts[0])?;
+        let end = self.parse_srt_time(parts[1])?;
+
+        Some((start, end))
+    }
+
+    fn parse_srt_time(&self, time_str: &str) -> Option<gst::ClockTime> {
+        // Parse SRT time format: "00:00:01,234"
+        let parts: Vec<&str> = time_str.split(',').collect();
+        if parts.len() != 2 {
+            return None;
+        }
+
+        let time_part = parts[0];
+        let millis: u32 = parts[1].parse().ok()?;
+
+        let time_components: Vec<&str> = time_part.split(':').collect();
+        if time_components.len() != 3 {
+            return None;
+        }
+
+        let hours: u32 = time_components[0].parse().ok()?;
+        let minutes: u32 = time_components[1].parse().ok()?;
+        let seconds: u32 = time_components[2].parse().ok()?;
+
+        let total_millis = hours * 3600000 + minutes * 60000 + seconds * 1000 + millis;
+        let nanoseconds = total_millis as u64 * 1_000_000;
+
+        Some(gst::ClockTime::from_nseconds(nanoseconds))
+    }
+
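+    /// Converts a decoded ffmpeg subtitle and its packet timing into a cue,
+    /// using the stream time base to translate PTS and duration into `ClockTime`.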
+    fn subtitle_to_cue(
+        subtitle: &ffmpeg::Subtitle,
+        packet: &ffmpeg::Packet,
+        time_base: Rational,
+    ) -> Option<SubtitleCue> {
+        let time_to_clock_time = |time: i64| {
+            let nseconds: i64 = (time * time_base.numerator() as i64 * 1_000_000_000)
+                / time_base.denominator() as i64;
+            gst::ClockTime::from_nseconds(nseconds as u64)
+        };
+
+        let text = subtitle
+            .rects()
+            .into_iter()
+            .map(|rect| match rect {
+                ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
+                ffmpeg::subtitle::Rect::Ass(ass) => {
+                    Self::extract_dialogue_text(ass.get()).unwrap_or_default()
+                }
+                _ => String::new(),
+            })
+            .collect::<Vec<String>>()
+            .join("\n— ");
+
+        let start = time_to_clock_time(packet.pts()?);
+        let end = time_to_clock_time(packet.pts()? + packet.duration());
+
+        Some(SubtitleCue { start, end, text })
+    }
+
+    fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
+        // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
+        // we need the 9th field (Text), so split on comma but only take first 9 splits
+        // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
+        let text = dialogue_line.splitn(9, ',').last()?;
+
+        // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
+        let mut result = String::new();
+        let mut in_tag = false;
+        let mut char_iter = text.chars().peekable();
+
+        while let Some(c) = char_iter.next() {
+            if c == '{' && char_iter.peek() == Some(&'\\') {
+                in_tag = true;
+            } else if c == '}' {
+                in_tag = false;
+            } else if !in_tag {
+                // process line breaks and hard spaces
+                if c == '\\' {
+                    match char_iter.peek() {
+                        Some(&'N') => {
+                            char_iter.next();
+                            result.push('\n');
+                        }
+                        Some(&'n') | Some(&'h') => {
+                            char_iter.next();
+                            result.push(' ');
+                        }
+                        _ => result.push(c),
+                    }
+                } else {
+                    result.push(c);
+                }
+            }
+        }
+
+        Some(result)
+    }
+}