Diffstat (limited to 'src')
-rw-r--r--   src/app.rs                          157
-rw-r--r--   src/subtitle_extractor_aishit.rs    732
2 files changed, 91 insertions, 798 deletions
diff --git a/src/app.rs b/src/app.rs index 7aa5abd..066980c 100644 --- a/src/app.rs +++ b/src/app.rs @@ -12,7 +12,7 @@ use crate::{ subtitle_view::{SubtitleView, SubtitleViewMsg, SubtitleViewOutput}, tracks::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue}, transcript::{Transcript, TranscriptMsg, TranscriptOutput}, - util::OptionTracker, + util::{OptionTracker, Tracker}, }; pub struct App { @@ -26,13 +26,14 @@ pub struct App { subtitle_selection_dialog: Controller<SubtitleSelectionDialog>, primary_stream_ix: Option<StreamIndex>, - primary_last_cue_ix: OptionTracker<usize>, + primary_cue: Tracker<Option<String>>, + primary_last_cue_ix: Tracker<Option<usize>>, + secondary_cue: Tracker<Option<String>>, secondary_stream_ix: Option<StreamIndex>, - secondary_last_cue_ix: OptionTracker<usize>, + secondary_last_cue_ix: Tracker<Option<usize>>, // for auto-pausing autopaused: bool, - primary_cue_active: bool, hovering_primary_cue: bool, } @@ -162,12 +163,13 @@ impl SimpleComponent for App { subtitle_selection_dialog, primary_stream_ix: None, - primary_last_cue_ix: OptionTracker::new(None), + primary_cue: Tracker::new(None), + primary_last_cue_ix: Tracker::new(None), secondary_stream_ix: None, - secondary_last_cue_ix: OptionTracker::new(None), + secondary_cue: Tracker::new(None), + secondary_last_cue_ix: Tracker::new(None), autopaused: false, - primary_cue_active: false, hovering_primary_cue: false, }; @@ -177,9 +179,6 @@ impl SimpleComponent for App { } fn update(&mut self, message: Self::Input, _sender: ComponentSender<Self>) { - self.primary_last_cue_ix.reset(); - self.secondary_last_cue_ix.reset(); - match message { AppMsg::NewCue(stream_index, cue) => { self.transcript @@ -203,20 +202,41 @@ impl SimpleComponent for App { } AppMsg::PositionUpdate(pos) => { if let Some(stream_ix) = self.primary_stream_ix { - let cue = - Self::get_cue_and_update_ix(stream_ix, pos, &mut self.primary_last_cue_ix); - let cue_is_some = cue.is_some(); - - // beginning of new subtitle - if self.primary_last_cue_ix.is_dirty() - || (!self.primary_cue_active && cue_is_some) - { + // sometimes we get a few position update messages after + // auto-pausing; this prevents us from immediately un-autopausing + // again + if self.autopaused { + return; + } + + let cue_was_some = self.primary_cue.get().is_some(); + + Self::update_cue( + stream_ix, + pos, + &mut self.primary_cue, + &mut self.primary_last_cue_ix, + ); + + if self.primary_cue.is_dirty() { + // last cue just ended -> auto-pause + if cue_was_some && self.hovering_primary_cue { + self.player.sender().send(PlayerMsg::Pause).unwrap(); + self.autopaused = true; + return; + } + self.subtitle_view .sender() - .send(SubtitleViewMsg::SetPrimaryCue(cue)) + .send(SubtitleViewMsg::SetPrimaryCue( + self.primary_cue.get().clone(), + )) .unwrap(); - self.primary_cue_active = cue_is_some; + self.primary_cue.reset(); + } + + if self.primary_last_cue_ix.is_dirty() { if let Some(ix) = self.primary_last_cue_ix.get() { self.transcript .sender() @@ -226,33 +246,24 @@ impl SimpleComponent for App { self.primary_last_cue_ix.reset(); } - - // end of current subtitle - if self.primary_cue_active && !cue_is_some && !self.autopaused { - if self.hovering_primary_cue { - self.player.sender().send(PlayerMsg::Pause).unwrap(); - self.autopaused = true; - } else { - self.subtitle_view - .sender() - .send(SubtitleViewMsg::SetPrimaryCue(None)) - .unwrap(); - self.primary_cue_active = false; - } - } } if let Some(stream_ix) = self.secondary_stream_ix { - if !self.autopaused { + Self::update_cue( + stream_ix, + 
pos, + &mut self.secondary_cue, + &mut self.secondary_last_cue_ix, + ); + + if !self.autopaused && self.secondary_cue.is_dirty() { self.subtitle_view .sender() .send(SubtitleViewMsg::SetSecondaryCue( - Self::get_cue_and_update_ix( - stream_ix, - pos, - &mut self.secondary_last_cue_ix, - ), + self.secondary_cue.get().clone(), )) .unwrap(); + + self.secondary_cue.reset(); } } } @@ -302,50 +313,64 @@ impl SimpleComponent for App { } impl App { - fn get_cue_and_update_ix( + fn update_cue( stream_ix: StreamIndex, position: gst::ClockTime, - last_cue_ix: &mut OptionTracker<usize>, - ) -> Option<String> { + cue: &mut Tracker<Option<String>>, + last_cue_ix: &mut Tracker<Option<usize>>, + ) { let lock = SUBTITLE_TRACKS.read(); - let track = lock.get(&stream_ix)?; + let track = lock.get(&stream_ix).unwrap(); // try to find current cue quickly (should usually succeed during playback) if let Some(ix) = last_cue_ix.get() { - let last_cue = track.cues.get(*ix)?; + let last_cue = track.cues.get(*ix).unwrap(); if last_cue.start <= position && position <= last_cue.end { - return Some(last_cue.text.clone()); - } - let next_cue = track.cues.get(ix + 1)?; - if last_cue.end < position && position < next_cue.start { - return None; - } - if next_cue.start <= position && position <= next_cue.end { - last_cue_ix.set(Some(ix + 1)); - return Some(next_cue.text.clone()); + // still at current cue + return; + } else if let Some(next_cue) = track.cues.get(ix + 1) { + if last_cue.end < position && position < next_cue.start { + // strictly between cues + cue.set(None); + return; + } + if next_cue.start <= position && position <= next_cue.end { + // already in next cue (this happens when one cue immediately + // follows the previous one) + cue.set(Some(next_cue.text.clone())); + last_cue_ix.set(Some(ix + 1)); + return; + } } } // if we are before the first subtitle, no need to look further - if position < track.cues.first()?.start { + if track.cues.is_empty() || position < track.cues.first().unwrap().start { + cue.set(None); last_cue_ix.set(None); - return None; + return; } // otherwise, search the whole track (e.g. 
after seeking) - let (ix, cue) = track + match track .cues .iter() .enumerate() .rev() - .find(|(_ix, cue)| cue.start <= position)?; - - last_cue_ix.set(Some(ix)); - - if position <= cue.end { - Some(cue.text.clone()) - } else { - None - } + .find(|(_ix, cue)| cue.start <= position) + { + Some((ix, new_cue)) => { + last_cue_ix.set(Some(ix)); + if position <= new_cue.end { + cue.set(Some(new_cue.text.clone())); + } else { + cue.set(None); + } + } + None => { + cue.set(None); + last_cue_ix.set(None); + } + }; } } diff --git a/src/subtitle_extractor_aishit.rs b/src/subtitle_extractor_aishit.rs deleted file mode 100644 index c615f6c..0000000 --- a/src/subtitle_extractor_aishit.rs +++ /dev/null @@ -1,732 +0,0 @@ -use std::collections::BTreeMap; - -use anyhow::Result; - -use ffmpeg::Rational; -use log::{debug, error, info, warn}; -use relm4::{ComponentSender, SharedState, Worker}; - -pub type StreamIndex = usize; - -#[derive(Debug, Clone)] -pub struct SubtitleCue { - pub start: gst::ClockTime, - pub end: gst::ClockTime, - pub text: String, -} - -#[derive(Debug, Clone)] -pub struct SubtitleTrack { - pub language: Option<isolang::Language>, - pub title: Option<String>, - pub cues: Vec<SubtitleCue>, - pub is_generated: bool, // true if generated from audio -} - -pub static TRACKS: SharedState<BTreeMap<StreamIndex, SubtitleTrack>> = SharedState::new(); - -pub struct SubtitleExtractor {} - -#[derive(Debug)] -pub enum SubtitleExtractorMsg { - ExtractFromUrl(String), -} - -#[derive(Debug)] -pub enum SubtitleExtractorOutput { - NewOrUpdatedTrackMetadata(StreamIndex), - NewCue(StreamIndex, SubtitleCue), - ExtractionComplete, -} - -impl Worker for SubtitleExtractor { - type Init = (); - type Input = SubtitleExtractorMsg; - type Output = SubtitleExtractorOutput; - - fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self { - Self {} - } - - fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) { - match msg { - SubtitleExtractorMsg::ExtractFromUrl(url) => { - self.handle_extract_from_url(url, sender); - } - } - } -} - -impl SubtitleExtractor { - fn handle_extract_from_url(&mut self, url: String, sender: ComponentSender<Self>) { - // Clear existing tracks - TRACKS.write().clear(); - - // Try to extract subtitles using ffmpeg - match self.extract_subtitles_ffmpeg(&url, &sender) { - Ok(_) => { - info!("Subtitle extraction completed successfully"); - sender - .output(SubtitleExtractorOutput::ExtractionComplete) - .unwrap(); - } - Err(e) => { - error!("FFmpeg extraction failed: {}", e); - } - } - } - - fn extract_subtitles_ffmpeg(&self, url: &str, sender: &ComponentSender<Self>) -> Result<()> { - info!("Starting subtitle extraction from: {}", url); - let mut input = ffmpeg::format::input(&url)?; - - // Log input format info - info!( - "Input format: {} ({} streams)", - input.format().name(), - input.streams().count() - ); - - // Check if whisper filter is available - if let Some(whisper_filter) = ffmpeg::filter::find("whisper") { - info!("Whisper filter found: {}", whisper_filter.name()); - } else { - warn!("Whisper filter not found - audio transcription will be skipped"); - } - - let mut subtitle_decoders = BTreeMap::new(); - let mut audio_decoder: Option<ffmpeg::decoder::Audio> = None; - let mut _whisper_filter_graph: Option<ffmpeg::filter::Graph> = None; - let mut whisper_source: Option<ffmpeg::filter::Context> = None; - let mut whisper_sink: Option<ffmpeg::filter::Context> = None; - let mut best_audio_stream_index: Option<usize> = None; - - // Find best audio stream for 
whisper processing - if let Some(audio_stream) = input.streams().best(ffmpeg::media::Type::Audio) { - best_audio_stream_index = Some(audio_stream.index()); - - // Get audio parameters safely - let codec_id = audio_stream.parameters().id(); - let channels = if let Ok(context) = - ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters()) - { - if let Ok(audio) = context.decoder().audio() { - audio.channels() - } else { - 0 - } - } else { - 0 - }; - let sample_rate = if let Ok(context) = - ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters()) - { - if let Ok(audio) = context.decoder().audio() { - audio.rate() - } else { - 0 - } - } else { - 0 - }; - - info!( - "Found best audio stream: index {} (codec: {:?}, channels: {}, sample_rate: {})", - audio_stream.index(), - codec_id, - channels, - sample_rate - ); - } else { - info!("No audio stream found for whisper processing"); - } - - // Set up whisper filter graph if we found an audio stream - if let Some(audio_index) = best_audio_stream_index { - info!("Setting up whisper filter for audio stream {}", audio_index); - - let audio_stream = input.stream(audio_index).unwrap(); - if let Ok(context) = - ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters()) - { - if let Ok(decoder) = context.decoder().audio() { - // Get decoder properties before moving it - let decoder_rate = decoder.rate(); - let decoder_format = decoder.format(); - let decoder_channel_layout = decoder.channel_layout().bits(); - - audio_decoder = Some(decoder); - - // Set up whisper filter graph - debug!("Creating whisper filter graph..."); - debug!( - "Audio stream time_base: {}, decoder rate: {}, format: {:?}, channel_layout: 0x{:x}", - audio_stream.time_base(), - decoder_rate, - decoder_format, - decoder_channel_layout - ); - match self.setup_whisper_filter(&audio_stream) { - Ok((graph, source, sink)) => { - info!("Whisper filter graph created successfully"); - _whisper_filter_graph = Some(graph); - whisper_source = Some(source); - whisper_sink = Some(sink); - debug!("Whisper source and sink contexts stored"); - - // Create a generated subtitle track - let track = SubtitleTrack { - language: Some(isolang::Language::from_639_1("en").unwrap_or_else( - || isolang::Language::from_639_3("eng").unwrap(), - )), - title: Some("Generated from Audio (Whisper)".to_string()), - cues: Vec::new(), - is_generated: true, - }; - - let whisper_stream_index = 1000; // Use high index for generated tracks - TRACKS.write().insert(whisper_stream_index, track); - - sender - .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata( - whisper_stream_index, - )) - .unwrap(); - } - Err(e) => { - error!("Failed to setup whisper filter: {}", e); - debug!("Whisper filter error details: {:?}", e); - warn!( - "Audio transcription will be skipped due to filter setup failure" - ); - } - } - } - } - } - - // Create decoder for each subtitle stream - for (stream_index, stream) in input.streams().enumerate() { - if stream.parameters().medium() == ffmpeg::media::Type::Subtitle { - let language_code = stream.metadata().get("language").map(|s| s.to_string()); - let title = stream.metadata().get("title").map(|s| s.to_string()); - - let track = SubtitleTrack { - language: language_code.and_then(|code| isolang::Language::from_639_2b(&code)), - title, - cues: Vec::new(), - is_generated: false, - }; - - TRACKS.write().insert(stream_index, track); - - sender - .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata( - stream_index, - )) - .unwrap(); - - let 
context = - ffmpeg::codec::context::Context::from_parameters(stream.parameters())?; - if let Ok(decoder) = context.decoder().subtitle() { - subtitle_decoders.insert(stream_index, decoder); - debug!("Created decoder for subtitle stream {}", stream_index); - } else { - error!( - "Failed to create decoder for subtitle stream {}", - stream_index - ); - } - } else { - debug!( - "Failed to create context for subtitle stream {}", - stream_index - ); - } - } - - // Process packets - for (stream, packet) in input.packets() { - let stream_index = stream.index(); - - // Process subtitle packets - if let Some(decoder) = subtitle_decoders.get_mut(&stream_index) { - let mut subtitle = ffmpeg::Subtitle::new(); - if decoder.decode(&packet, &mut subtitle).is_ok() { - if let Some(cue) = Self::subtitle_to_cue(&subtitle, &packet, stream.time_base()) - { - if let Some(track) = TRACKS.write().get_mut(&stream_index) { - track.cues.push(cue.clone()); - } - - sender - .output(SubtitleExtractorOutput::NewCue(stream_index, cue)) - .unwrap(); - } - } - } - - // Process audio packets for whisper - if Some(stream_index) == best_audio_stream_index { - debug!( - "Processing audio packet for whisper (stream: {}, pts: {:?}, duration: {:?})", - stream_index, - packet.pts(), - packet.duration() - ); - debug!( - "Audio decoder available: {}, Whisper source available: {}", - audio_decoder.is_some(), - whisper_source.is_some() - ); - if let (Some(decoder), Some(source)) = (&mut audio_decoder, &mut whisper_source) { - debug!("Both audio decoder and whisper source are available, processing..."); - // Send packet to audio decoder - if let Err(e) = decoder.send_packet(&packet) { - debug!("Failed to send packet to audio decoder: {}", e); - } - - // Get decoded frames and send to whisper filter - let mut frame = unsafe { ffmpeg::Frame::empty() }; - let mut frame_count = 0; - while decoder.receive_frame(&mut frame).is_ok() { - frame_count += 1; - debug!( - "Decoded audio frame {} (pts: {:?})", - frame_count, - frame.pts() - ); - - // Add frame to whisper filter - if let Err(e) = source.source().add(&frame) { - error!("Failed to add frame to whisper filter: {}", e); - } else { - debug!("Successfully added frame to whisper filter"); - } - - // Check for whisper output after adding each frame - if let Some(sink) = &mut whisper_sink { - self.check_whisper_output(sink, sender)?; - } - } - if frame_count > 0 { - debug!("Processed {} audio frames for whisper", frame_count); - } - } else { - debug!("Skipping audio packet - decoder or whisper source not available"); - } - } - } - - // Flush audio decoder and whisper filter - if let (Some(decoder), Some(source), Some(sink)) = - (&mut audio_decoder, &mut whisper_source, &mut whisper_sink) - { - info!("Flushing audio decoder and whisper filter..."); - // Flush decoder - if let Err(e) = decoder.send_eof() { - debug!("Failed to send EOF to decoder: {}", e); - } - let mut frame = unsafe { ffmpeg::Frame::empty() }; - let mut final_frame_count = 0; - while decoder.receive_frame(&mut frame).is_ok() { - final_frame_count += 1; - source.source().add(&frame).ok(); - } - debug!("Flushed {} final frames from decoder", final_frame_count); - - // Flush filter and get results - debug!("Flushing whisper filter..."); - if let Err(e) = source.source().flush() { - error!("Failed to flush whisper filter: {}", e); - } - - info!("Processing final whisper filter output..."); - self.check_whisper_output(sink, sender)?; - } - - Ok(()) - } - - fn setup_whisper_filter( - &self, - audio_stream: &ffmpeg::Stream, - ) -> 
Result<( - ffmpeg::filter::Graph, - ffmpeg::filter::Context, - ffmpeg::filter::Context, - )> { - debug!("Setting up whisper filter graph..."); - let mut filter_graph = ffmpeg::filter::Graph::new(); - debug!("Filter graph created successfully"); - - // Get audio parameters - debug!("Getting audio parameters..."); - let time_base = audio_stream.time_base(); - let audio_params = audio_stream.parameters(); - debug!("Creating context from parameters..."); - let context = ffmpeg::codec::context::Context::from_parameters(audio_params)?; - debug!("Getting audio decoder from context..."); - let audio_decoder = context.decoder().audio()?; - debug!("Audio decoder created successfully"); - - // Create buffer source - let buffer_args = format!( - "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}", - time_base, - audio_decoder.rate(), - audio_decoder.format().name(), - audio_decoder.channel_layout().bits() - ); - debug!("Buffer args: {}", buffer_args); - - debug!("Looking for abuffer filter..."); - let abuffer_filter = ffmpeg::filter::find("abuffer") - .ok_or_else(|| anyhow::anyhow!("abuffer filter not found"))?; - debug!("abuffer filter found: {}", abuffer_filter.name()); - - debug!("Adding abuffer filter..."); - match filter_graph.add(&abuffer_filter, "src", &buffer_args) { - Ok(_) => debug!("abuffer filter added successfully"), - Err(e) => { - error!("Failed to add abuffer filter: {}", e); - return Err(anyhow::anyhow!("Failed to add abuffer filter: {}", e)); - } - } - - // Create whisper filter with parameters - // Try absolute path and different parameter formats - let model_path = std::path::Path::new("./whisper-models/ggml-large-v3.bin"); - let absolute_path = if model_path.exists() { - model_path - .canonicalize() - .map(|p| p.to_string_lossy().to_string()) - .unwrap_or_else(|_| "./whisper-models/ggml-large-v3.bin".to_string()) - } else { - warn!("Whisper model file not found at: {:?}", model_path); - "./whisper-models/ggml-large-v3.bin".to_string() - }; - - debug!("Model path exists: {}", model_path.exists()); - debug!("Using absolute path: {}", absolute_path); - - debug!("Looking for whisper filter..."); - let whisper_filter = ffmpeg::filter::find("whisper").ok_or_else(|| { - error!("Whisper filter not found! 
Make sure FFmpeg was compiled with whisper support"); - anyhow::anyhow!("Whisper filter not available") - })?; - - debug!("Whisper filter found: {}", whisper_filter.name()); - // We'll create the whisper filter through the parse method instead of adding it manually - - // Create audio buffer sink for whisper output (whisper outputs audio + metadata) - debug!("Looking for abuffersink filter for audio output..."); - let abuffersink_filter = ffmpeg::filter::find("abuffersink") - .ok_or_else(|| anyhow::anyhow!("abuffersink filter not found"))?; - debug!("abuffersink filter found: {}", abuffersink_filter.name()); - - debug!("Adding abuffersink filter..."); - match filter_graph.add(&abuffersink_filter, "sink", "") { - Ok(_) => debug!("abuffersink filter added successfully"), - Err(e) => { - error!("Failed to add abuffersink filter: {}", e); - return Err(anyhow::anyhow!("Failed to add abuffersink filter: {}", e)); - } - } - - // Connect filters using the complete filter chain description - debug!("Connecting filter graph with complete chain: src -> whisper -> sink"); - - let filter_chain = format!( - "[src]whisper=model={}:queue=30:format=json[sink]", - "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin", - //"/Users/malte/repos/lleap/whisper-models/ggml-silero-v5.1.2.bin" - ); - debug!("Using filter chain: {}", filter_chain); - - if let Err(e) = filter_graph - .output("src", 0) - .and_then(|o| o.input("sink", 0)) - .and_then(|i| i.parse(&filter_chain)) - { - error!("Failed to connect filter graph: {}", e); - return Err(anyhow::anyhow!("Failed to connect filter graph: {}", e)); - } - debug!("Filter graph connected successfully"); - - // Validate filter graph - debug!("Validating filter graph..."); - match filter_graph.validate() { - Ok(_) => { - info!("Filter graph validated successfully"); - debug!("Filter graph dump:\n{}", filter_graph.dump()); - } - Err(e) => { - error!("Filter graph validation failed: {}", e); - debug!( - "Filter graph dump before validation failure:\n{}", - filter_graph.dump() - ); - return Err(anyhow::anyhow!("Filter graph validation failed: {}", e)); - } - } - - debug!("Getting final source and sink contexts..."); - let source_ctx = filter_graph - .get("src") - .ok_or_else(|| anyhow::anyhow!("Source context not found"))?; - let sink_ctx = filter_graph - .get("sink") - .ok_or_else(|| anyhow::anyhow!("Sink context not found"))?; - debug!("Final contexts retrieved successfully"); - - Ok((filter_graph, source_ctx, sink_ctx)) - } - - fn check_whisper_output( - &self, - sink: &mut ffmpeg::filter::Context, - sender: &ComponentSender<Self>, - ) -> Result<()> { - debug!("Attempting to read audio frames from whisper filter output..."); - - // The whisper filter outputs audio frames with subtitle data in "lavfi.whisper.text" metadata - let mut frame = unsafe { ffmpeg::Frame::empty() }; - let mut output_count = 0; - - while sink.sink().frame(&mut frame).is_ok() { - output_count += 1; - debug!( - "Received audio frame {} from whisper filter (pts: {:?})", - output_count, - frame.pts() - ); - - // Look specifically for lavfi.whisper.text metadata - if let Some(whisper_text) = frame.metadata().get("lavfi.whisper.text") { - info!("Found whisper transcription: {}", whisper_text); - - let start_time = if let Some(pts) = frame.pts() { - // Convert PTS to nanoseconds based on whisper filter's time base (16kHz) - gst::ClockTime::from_nseconds((pts as u64 * 1_000_000_000) / 16000) - } else { - gst::ClockTime::ZERO - }; - - // Log all available metadata keys to help debug - let 
metadata_entries: Vec<(String, String)> = frame - .metadata() - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(); - if !metadata_entries.is_empty() { - let metadata_keys: Vec<String> = - metadata_entries.iter().map(|(k, _)| k.clone()).collect(); - debug!("Frame metadata keys: {:?}", metadata_keys); - } - - // Parse the whisper text (might be JSON format) - self.parse_whisper_text(whisper_text, start_time, sender)?; - } - } - - if output_count > 0 { - info!("Processed {} frames from whisper filter", output_count); - } else { - debug!("No frames available from whisper filter"); - } - - Ok(()) - } - - fn parse_whisper_text( - &self, - whisper_text: &str, - base_time: gst::ClockTime, - sender: &ComponentSender<Self>, - ) -> Result<()> { - debug!("Parsing whisper text: {}", whisper_text); - - // The whisper text might be in different formats depending on the filter configuration - // For now, treat it as plain text and create a single cue - let cue = SubtitleCue { - start: base_time, - end: base_time + gst::ClockTime::from_seconds(3), // Default 3 second duration - text: whisper_text.to_string(), - }; - - let whisper_stream_index = 1000; - if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) { - track.cues.push(cue.clone()); - } - - sender - .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue)) - .unwrap(); - - Ok(()) - } - - fn parse_whisper_subtitle_data( - &self, - subtitle_data: &str, - sender: &ComponentSender<Self>, - ) -> Result<()> { - // Parse SRT-format output from whisper - info!( - "Parsing whisper subtitle data ({} characters)", - subtitle_data.len() - ); - debug!("Subtitle data content:\n{}", subtitle_data); - let lines: Vec<&str> = subtitle_data.lines().collect(); - let mut i = 0; - - while i < lines.len() { - // Skip subtitle number - if lines[i].trim().parse::<i32>().is_ok() { - i += 1; - } - - // Parse timestamp line - if i < lines.len() { - if let Some((start, end)) = self.parse_srt_timestamp(lines[i]) { - i += 1; - - // Collect text lines - let mut text_lines = Vec::new(); - while i < lines.len() && !lines[i].trim().is_empty() { - text_lines.push(lines[i].to_string()); - i += 1; - } - - if !text_lines.is_empty() { - let cue = SubtitleCue { - start, - end, - text: text_lines.join("\n"), - }; - - let whisper_stream_index = 1000; - if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) { - track.cues.push(cue.clone()); - } - - sender - .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue)) - .unwrap(); - } - } - } - i += 1; - } - - Ok(()) - } - - fn parse_srt_timestamp(&self, line: &str) -> Option<(gst::ClockTime, gst::ClockTime)> { - // Parse SRT timestamp format: "00:00:01,234 --> 00:00:05,678" - let parts: Vec<&str> = line.split(" --> ").collect(); - if parts.len() != 2 { - return None; - } - - let start = self.parse_srt_time(parts[0])?; - let end = self.parse_srt_time(parts[1])?; - - Some((start, end)) - } - - fn parse_srt_time(&self, time_str: &str) -> Option<gst::ClockTime> { - // Parse SRT time format: "00:00:01,234" - let parts: Vec<&str> = time_str.split(',').collect(); - if parts.len() != 2 { - return None; - } - - let time_part = parts[0]; - let millis: u32 = parts[1].parse().ok()?; - - let time_components: Vec<&str> = time_part.split(':').collect(); - if time_components.len() != 3 { - return None; - } - - let hours: u32 = time_components[0].parse().ok()?; - let minutes: u32 = time_components[1].parse().ok()?; - let seconds: u32 = time_components[2].parse().ok()?; - - let total_millis = hours * 
3600000 + minutes * 60000 + seconds * 1000 + millis; - let nanoseconds = total_millis as u64 * 1_000_000; - - Some(gst::ClockTime::from_nseconds(nanoseconds)) - } - - fn subtitle_to_cue( - subtitle: &ffmpeg::Subtitle, - packet: &ffmpeg::Packet, - time_base: Rational, - ) -> Option<SubtitleCue> { - let time_to_clock_time = |time: i64| { - let nseconds: i64 = (time * time_base.numerator() as i64 * 1_000_000_000) - / time_base.denominator() as i64; - gst::ClockTime::from_nseconds(nseconds as u64) - }; - - let text = subtitle - .rects() - .into_iter() - .map(|rect| match rect { - ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(), - ffmpeg::subtitle::Rect::Ass(ass) => { - Self::extract_dialogue_text(ass.get()).unwrap_or(String::new()) - } - _ => String::new(), - }) - .collect::<Vec<String>>() - .join("\n— "); - - let start = time_to_clock_time(packet.pts()?); - let end = time_to_clock_time(packet.pts()? + packet.duration()); - - Some(SubtitleCue { start, end, text }) - } - - fn extract_dialogue_text(dialogue_line: &str) -> Option<String> { - // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text - // we need the 9th field (Text), so split on comma but only take first 9 splits - // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433 - let text = dialogue_line.splitn(9, ',').last()?; - - // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc. - let mut result = String::new(); - let mut in_tag = false; - let mut char_iter = text.chars().peekable(); - - while let Some(c) = char_iter.next() { - if c == '{' && char_iter.peek() == Some(&'\\') { - in_tag = true; - } else if c == '}' { - in_tag = false; - } else if !in_tag { - // process line breaks and hard spaces - if c == '\\' { - match char_iter.peek() { - Some(&'N') => { - char_iter.next(); - result.push('\n'); - } - Some(&'n') | Some(&'h') => { - char_iter.next(); - result.push(' '); - } - _ => result.push(c), - } - } else { - result.push(c); - } - } - } - - Some(result) - } -} |
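
Note on util::Tracker: the rewritten app.rs code relies on util::Tracker, which is not part of this diff (src/util.rs is unchanged). The following is only a sketch of a change-tracking wrapper exposing the get/set/is_dirty/reset calls used above; the actual implementation in util.rs may differ, in particular in whether set marks the value dirty unconditionally or only when the stored value really changes.

    /// Hypothetical sketch of a change-tracking wrapper matching the calls
    /// made from app.rs (get, set, is_dirty, reset). The real util::Tracker
    /// in this repository may be implemented differently.
    pub struct Tracker<T> {
        value: T,
        dirty: bool,
    }

    impl<T: PartialEq> Tracker<T> {
        pub fn new(value: T) -> Self {
            Self { value, dirty: false }
        }

        /// Read the current value; does not touch the dirty flag.
        pub fn get(&self) -> &T {
            &self.value
        }

        /// Store a new value, marking the tracker dirty only if it differs
        /// from the current one (assumption, see the note above).
        pub fn set(&mut self, value: T) {
            if self.value != value {
                self.value = value;
                self.dirty = true;
            }
        }

        /// Whether the value has changed since the last reset().
        pub fn is_dirty(&self) -> bool {
            self.dirty
        }

        /// Acknowledge the change.
        pub fn reset(&mut self) {
            self.dirty = false;
        }
    }

If set only flags a real change, the is_dirty branches in update (re-sending SetPrimaryCue / SetSecondaryCue, and auto-pausing when the hovered primary cue ends) fire once per transition, even though update_cue may call set with the same value on every PositionUpdate.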