use std::collections::BTreeMap;

use anyhow::Result;
use ffmpeg::Rational;
use log::{debug, error, info, warn};
use relm4::{ComponentSender, SharedState, Worker};

pub type StreamIndex = usize;

#[derive(Debug, Clone)]
pub struct SubtitleCue {
    pub start: gst::ClockTime,
    pub end: gst::ClockTime,
    pub text: String,
}

#[derive(Debug, Clone)]
pub struct SubtitleTrack {
    pub language: Option<isolang::Language>,
    pub title: Option<String>,
    pub cues: Vec<SubtitleCue>,
    pub is_generated: bool, // true if generated from audio
}

pub static TRACKS: SharedState<BTreeMap<StreamIndex, SubtitleTrack>> = SharedState::new();

pub struct SubtitleExtractor {}

#[derive(Debug)]
pub enum SubtitleExtractorMsg {
    ExtractFromUrl(String),
}

#[derive(Debug)]
pub enum SubtitleExtractorOutput {
    NewOrUpdatedTrackMetadata(StreamIndex),
    NewCue(StreamIndex, SubtitleCue),
    ExtractionComplete,
}

impl Worker for SubtitleExtractor {
    type Init = ();
    type Input = SubtitleExtractorMsg;
    type Output = SubtitleExtractorOutput;

    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
        Self {}
    }

    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
        match msg {
            SubtitleExtractorMsg::ExtractFromUrl(url) => {
                self.handle_extract_from_url(url, sender);
            }
        }
    }
}

impl SubtitleExtractor {
    fn handle_extract_from_url(&mut self, url: String, sender: ComponentSender<Self>) {
        // Clear existing tracks
        TRACKS.write().clear();

        // Try to extract subtitles using ffmpeg
        match self.extract_subtitles_ffmpeg(&url, &sender) {
            Ok(_) => {
                info!("Subtitle extraction completed successfully");
                sender
                    .output(SubtitleExtractorOutput::ExtractionComplete)
                    .unwrap();
            }
            Err(e) => {
                error!("FFmpeg extraction failed: {}", e);
            }
        }
    }

    fn extract_subtitles_ffmpeg(
        &self,
        url: &str,
        sender: &ComponentSender<Self>,
    ) -> Result<()> {
        info!("Starting subtitle extraction from: {}", url);

        let mut input = ffmpeg::format::input(&url)?;

        // Log input format info
        info!(
            "Input format: {} ({} streams)",
            input.format().name(),
            input.streams().count()
        );

        // Check if whisper filter is available
        if let Some(whisper_filter) = ffmpeg::filter::find("whisper") {
            info!("Whisper filter found: {}", whisper_filter.name());
        } else {
            warn!("Whisper filter not found - audio transcription will be skipped");
        }

        let mut subtitle_decoders = BTreeMap::new();
        let mut audio_decoder: Option<ffmpeg::decoder::Audio> = None;
        let mut _whisper_filter_graph: Option<ffmpeg::filter::Graph> = None;
        let mut whisper_source: Option<ffmpeg::filter::Context> = None;
        let mut whisper_sink: Option<ffmpeg::filter::Context> = None;
        let mut best_audio_stream_index: Option<usize> = None;

        // Find best audio stream for whisper processing
        if let Some(audio_stream) = input.streams().best(ffmpeg::media::Type::Audio) {
            best_audio_stream_index = Some(audio_stream.index());

            // Get audio parameters safely
            let codec_id = audio_stream.parameters().id();
            let channels = if let Ok(context) =
                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
            {
                if let Ok(audio) = context.decoder().audio() {
                    audio.channels()
                } else {
                    0
                }
            } else {
                0
            };
            let sample_rate = if let Ok(context) =
                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
            {
                if let Ok(audio) = context.decoder().audio() {
                    audio.rate()
                } else {
                    0
                }
            } else {
                0
            };

            info!(
                "Found best audio stream: index {} (codec: {:?}, channels: {}, sample_rate: {})",
                audio_stream.index(),
                codec_id,
                channels,
                sample_rate
            );
        } else {
            info!("No audio stream found for whisper processing");
        }

        // Set up whisper filter graph if we found an audio stream
        if let Some(audio_index) = best_audio_stream_index {
            info!("Setting up whisper filter for audio stream {}", audio_index);
            let audio_stream = input.stream(audio_index).unwrap();

            if let Ok(context) =
                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
            {
                if let Ok(decoder) = context.decoder().audio() {
                    // Get decoder properties before moving it
                    let decoder_rate = decoder.rate();
                    let decoder_format = decoder.format();
                    let decoder_channel_layout = decoder.channel_layout().bits();
                    audio_decoder = Some(decoder);

                    // Set up whisper filter graph
                    debug!("Creating whisper filter graph...");
                    debug!(
                        "Audio stream time_base: {}, decoder rate: {}, format: {:?}, channel_layout: 0x{:x}",
                        audio_stream.time_base(),
                        decoder_rate,
                        decoder_format,
                        decoder_channel_layout
                    );

                    match self.setup_whisper_filter(&audio_stream) {
                        Ok((graph, source, sink)) => {
                            info!("Whisper filter graph created successfully");
                            _whisper_filter_graph = Some(graph);
                            whisper_source = Some(source);
                            whisper_sink = Some(sink);
                            debug!("Whisper source and sink contexts stored");

                            // Create a generated subtitle track
                            let track = SubtitleTrack {
                                language: Some(isolang::Language::from_639_1("en").unwrap_or_else(
                                    || isolang::Language::from_639_3("eng").unwrap(),
                                )),
                                title: Some("Generated from Audio (Whisper)".to_string()),
                                cues: Vec::new(),
                                is_generated: true,
                            };

                            let whisper_stream_index = 1000; // Use high index for generated tracks
                            TRACKS.write().insert(whisper_stream_index, track);
                            sender
                                .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
                                    whisper_stream_index,
                                ))
                                .unwrap();
                        }
                        Err(e) => {
                            error!("Failed to setup whisper filter: {}", e);
                            debug!("Whisper filter error details: {:?}", e);
                            warn!(
                                "Audio transcription will be skipped due to filter setup failure"
                            );
                        }
                    }
                }
            }
        }

        // Create a decoder for each subtitle stream
        for (stream_index, stream) in input.streams().enumerate() {
            if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
                let language_code = stream.metadata().get("language").map(|s| s.to_string());
                let title = stream.metadata().get("title").map(|s| s.to_string());

                let track = SubtitleTrack {
                    language: language_code.and_then(|code| isolang::Language::from_639_2b(&code)),
                    title,
                    cues: Vec::new(),
                    is_generated: false,
                };

                TRACKS.write().insert(stream_index, track);
                sender
                    .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
                        stream_index,
                    ))
                    .unwrap();

                let context =
                    ffmpeg::codec::context::Context::from_parameters(stream.parameters())?;
                if let Ok(decoder) = context.decoder().subtitle() {
                    subtitle_decoders.insert(stream_index, decoder);
                    debug!("Created decoder for subtitle stream {}", stream_index);
                } else {
                    error!(
                        "Failed to create decoder for subtitle stream {}",
                        stream_index
                    );
                }
            } else {
                debug!("Skipping non-subtitle stream {}", stream_index);
            }
        }

        // Process packets
        for (stream, packet) in input.packets() {
            let stream_index = stream.index();

            // Process subtitle packets
            if let Some(decoder) = subtitle_decoders.get_mut(&stream_index) {
                let mut subtitle = ffmpeg::Subtitle::new();
                if decoder.decode(&packet, &mut subtitle).is_ok() {
                    if let Some(cue) =
                        Self::subtitle_to_cue(&subtitle, &packet, stream.time_base())
                    {
                        if let Some(track) = TRACKS.write().get_mut(&stream_index) {
                            track.cues.push(cue.clone());
                        }
                        sender
                            .output(SubtitleExtractorOutput::NewCue(stream_index, cue))
                            .unwrap();
                    }
                }
            }

            // Process audio packets for whisper
            if Some(stream_index) == best_audio_stream_index {
                debug!(
                    "Processing audio packet for whisper (stream: {}, pts: {:?}, duration: {:?})",
                    stream_index,
                    packet.pts(),
                    packet.duration()
                );
                debug!(
                    "Audio decoder available: {}, Whisper source available: {}",
                    audio_decoder.is_some(),
                    whisper_source.is_some()
                );

                if let (Some(decoder), Some(source)) =
                    (&mut audio_decoder, &mut whisper_source)
                {
                    debug!("Both audio decoder and whisper source are available, processing...");

                    // Send packet to audio decoder
                    if let Err(e) = decoder.send_packet(&packet) {
                        debug!("Failed to send packet to audio decoder: {}", e);
                    }

                    // Get decoded frames and send to whisper filter
                    let mut frame = unsafe { ffmpeg::Frame::empty() };
                    let mut frame_count = 0;
                    while decoder.receive_frame(&mut frame).is_ok() {
                        frame_count += 1;
                        debug!(
                            "Decoded audio frame {} (pts: {:?})",
                            frame_count,
                            frame.pts()
                        );

                        // Add frame to whisper filter
                        if let Err(e) = source.source().add(&frame) {
                            error!("Failed to add frame to whisper filter: {}", e);
                        } else {
                            debug!("Successfully added frame to whisper filter");
                        }

                        // Check for whisper output after adding each frame
                        if let Some(sink) = &mut whisper_sink {
                            self.check_whisper_output(sink, sender)?;
                        }
                    }

                    if frame_count > 0 {
                        debug!("Processed {} audio frames for whisper", frame_count);
                    }
                } else {
                    debug!("Skipping audio packet - decoder or whisper source not available");
                }
            }
        }

        // Flush audio decoder and whisper filter
        if let (Some(decoder), Some(source), Some(sink)) =
            (&mut audio_decoder, &mut whisper_source, &mut whisper_sink)
        {
            info!("Flushing audio decoder and whisper filter...");

            // Flush decoder
            if let Err(e) = decoder.send_eof() {
                debug!("Failed to send EOF to decoder: {}", e);
            }

            let mut frame = unsafe { ffmpeg::Frame::empty() };
            let mut final_frame_count = 0;
            while decoder.receive_frame(&mut frame).is_ok() {
                final_frame_count += 1;
                source.source().add(&frame).ok();
            }
            debug!("Flushed {} final frames from decoder", final_frame_count);

            // Flush filter and get results
            debug!("Flushing whisper filter...");
            if let Err(e) = source.source().flush() {
                error!("Failed to flush whisper filter: {}", e);
            }

            info!("Processing final whisper filter output...");
            self.check_whisper_output(sink, sender)?;
        }

        Ok(())
    }

    fn setup_whisper_filter(
        &self,
        audio_stream: &ffmpeg::Stream,
    ) -> Result<(
        ffmpeg::filter::Graph,
        ffmpeg::filter::Context,
        ffmpeg::filter::Context,
    )> {
        debug!("Setting up whisper filter graph...");
        let mut filter_graph = ffmpeg::filter::Graph::new();
        debug!("Filter graph created successfully");

        // Get audio parameters
        debug!("Getting audio parameters...");
        let time_base = audio_stream.time_base();
        let audio_params = audio_stream.parameters();

        debug!("Creating context from parameters...");
        let context = ffmpeg::codec::context::Context::from_parameters(audio_params)?;
        debug!("Getting audio decoder from context...");
        let audio_decoder = context.decoder().audio()?;
        debug!("Audio decoder created successfully");

        // Create buffer source
        let buffer_args = format!(
            "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
            time_base,
            audio_decoder.rate(),
            audio_decoder.format().name(),
            audio_decoder.channel_layout().bits()
        );
        debug!("Buffer args: {}", buffer_args);

        debug!("Looking for abuffer filter...");
        let abuffer_filter = ffmpeg::filter::find("abuffer")
            .ok_or_else(|| anyhow::anyhow!("abuffer filter not found"))?;
        debug!("abuffer filter found: {}", abuffer_filter.name());

        debug!("Adding abuffer filter...");
        match filter_graph.add(&abuffer_filter, "src", &buffer_args) {
            Ok(_) => debug!("abuffer filter added successfully"),
            Err(e) => {
                error!("Failed to add abuffer filter: {}", e);
                return Err(anyhow::anyhow!("Failed to add abuffer filter: {}", e));
            }
        }

        // Create whisper filter with parameters
        // Try absolute path and different parameter formats
        let model_path = std::path::Path::new("./whisper-models/ggml-large-v3.bin");
        let absolute_path = if model_path.exists() {
            model_path
                .canonicalize()
                .map(|p| p.to_string_lossy().to_string())
                .unwrap_or_else(|_| "./whisper-models/ggml-large-v3.bin".to_string())
        } else {
            warn!("Whisper model file not found at: {:?}", model_path);
            "./whisper-models/ggml-large-v3.bin".to_string()
        };
        debug!("Model path exists: {}", model_path.exists());
        debug!("Using absolute path: {}", absolute_path);

        debug!("Looking for whisper filter...");
        let whisper_filter = ffmpeg::filter::find("whisper").ok_or_else(|| {
            error!("Whisper filter not found! Make sure FFmpeg was compiled with whisper support");
            anyhow::anyhow!("Whisper filter not available")
        })?;
        debug!("Whisper filter found: {}", whisper_filter.name());
        // We'll create the whisper filter through the parse method instead of adding it manually

        // Create audio buffer sink for whisper output (whisper outputs audio + metadata)
        debug!("Looking for abuffersink filter for audio output...");
        let abuffersink_filter = ffmpeg::filter::find("abuffersink")
            .ok_or_else(|| anyhow::anyhow!("abuffersink filter not found"))?;
        debug!("abuffersink filter found: {}", abuffersink_filter.name());

        debug!("Adding abuffersink filter...");
        match filter_graph.add(&abuffersink_filter, "sink", "") {
            Ok(_) => debug!("abuffersink filter added successfully"),
            Err(e) => {
                error!("Failed to add abuffersink filter: {}", e);
                return Err(anyhow::anyhow!("Failed to add abuffersink filter: {}", e));
            }
        }

        // Connect filters using the complete filter chain description,
        // pointing the whisper filter at the resolved model path
        debug!("Connecting filter graph with complete chain: src -> whisper -> sink");
        let filter_chain = format!(
            "[src]whisper=model={}:queue=30:format=json[sink]",
            absolute_path,
            //"/Users/malte/repos/lleap/whisper-models/ggml-silero-v5.1.2.bin"
        );
        debug!("Using filter chain: {}", filter_chain);

        if let Err(e) = filter_graph
            .output("src", 0)
            .and_then(|o| o.input("sink", 0))
            .and_then(|i| i.parse(&filter_chain))
        {
            error!("Failed to connect filter graph: {}", e);
            return Err(anyhow::anyhow!("Failed to connect filter graph: {}", e));
        }
        debug!("Filter graph connected successfully");

        // Validate filter graph
        debug!("Validating filter graph...");
        match filter_graph.validate() {
            Ok(_) => {
                info!("Filter graph validated successfully");
                debug!("Filter graph dump:\n{}", filter_graph.dump());
            }
            Err(e) => {
                error!("Filter graph validation failed: {}", e);
                debug!(
                    "Filter graph dump before validation failure:\n{}",
                    filter_graph.dump()
                );
                return Err(anyhow::anyhow!("Filter graph validation failed: {}", e));
            }
        }

        debug!("Getting final source and sink contexts...");
        let source_ctx = filter_graph
            .get("src")
            .ok_or_else(|| anyhow::anyhow!("Source context not found"))?;
        let sink_ctx = filter_graph
            .get("sink")
            .ok_or_else(|| anyhow::anyhow!("Sink context not found"))?;
        debug!("Final contexts retrieved successfully");

        Ok((filter_graph, source_ctx, sink_ctx))
    }

    fn check_whisper_output(
        &self,
        sink: &mut ffmpeg::filter::Context,
        sender: &ComponentSender<Self>,
    ) -> Result<()> {
        debug!("Attempting to read audio frames from whisper filter output...");

        // The whisper filter outputs audio frames with subtitle data in "lavfi.whisper.text" metadata
        let mut frame = unsafe { ffmpeg::Frame::empty() };
        let mut output_count = 0;

        while sink.sink().frame(&mut frame).is_ok() {
            output_count += 1;
            debug!(
                "Received audio frame {} from whisper filter (pts: {:?})",
                output_count,
                frame.pts()
            );

            // Look specifically for lavfi.whisper.text metadata
            if let Some(whisper_text) = frame.metadata().get("lavfi.whisper.text") {
                info!("Found whisper transcription: {}", whisper_text);
                let start_time = if let Some(pts) = frame.pts() {
                    // Convert PTS to nanoseconds based on whisper filter's time base (16kHz)
                    gst::ClockTime::from_nseconds((pts as u64 * 1_000_000_000) / 16000)
                } else {
                    gst::ClockTime::ZERO
                };

                // Log all available metadata keys to help debug
                let metadata_entries: Vec<(String, String)> = frame
                    .metadata()
                    .iter()
                    .map(|(k, v)| (k.to_string(), v.to_string()))
                    .collect();
                if !metadata_entries.is_empty() {
                    let metadata_keys: Vec<String> =
                        metadata_entries.iter().map(|(k, _)| k.clone()).collect();
                    debug!("Frame metadata keys: {:?}", metadata_keys);
                }

                // Parse the whisper text (might be JSON format)
                self.parse_whisper_text(whisper_text, start_time, sender)?;
            }
        }

        if output_count > 0 {
            info!("Processed {} frames from whisper filter", output_count);
        } else {
            debug!("No frames available from whisper filter");
        }

        Ok(())
    }

    fn parse_whisper_text(
        &self,
        whisper_text: &str,
        base_time: gst::ClockTime,
        sender: &ComponentSender<Self>,
    ) -> Result<()> {
        debug!("Parsing whisper text: {}", whisper_text);

        // The whisper text might be in different formats depending on the filter configuration
        // For now, treat it as plain text and create a single cue
        let cue = SubtitleCue {
            start: base_time,
            end: base_time + gst::ClockTime::from_seconds(3), // Default 3 second duration
            text: whisper_text.to_string(),
        };

        let whisper_stream_index = 1000;
        if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) {
            track.cues.push(cue.clone());
        }
        sender
            .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue))
            .unwrap();

        Ok(())
    }

    fn parse_whisper_subtitle_data(
        &self,
        subtitle_data: &str,
        sender: &ComponentSender<Self>,
    ) -> Result<()> {
        // Parse SRT-format output from whisper
        info!(
            "Parsing whisper subtitle data ({} characters)",
            subtitle_data.len()
        );
        debug!("Subtitle data content:\n{}", subtitle_data);

        let lines: Vec<&str> = subtitle_data.lines().collect();
        let mut i = 0;

        while i < lines.len() {
            // Skip subtitle number
            if lines[i].trim().parse::<u32>().is_ok() {
                i += 1;
            }

            // Parse timestamp line
            if i < lines.len() {
                if let Some((start, end)) = self.parse_srt_timestamp(lines[i]) {
                    i += 1;

                    // Collect text lines
                    let mut text_lines = Vec::new();
                    while i < lines.len() && !lines[i].trim().is_empty() {
                        text_lines.push(lines[i].to_string());
                        i += 1;
                    }

                    if !text_lines.is_empty() {
                        let cue = SubtitleCue {
                            start,
                            end,
                            text: text_lines.join("\n"),
                        };

                        let whisper_stream_index = 1000;
                        if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) {
                            track.cues.push(cue.clone());
                        }
                        sender
                            .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue))
                            .unwrap();
                    }
                }
            }

            i += 1;
        }

        Ok(())
    }

    fn parse_srt_timestamp(&self, line: &str) -> Option<(gst::ClockTime, gst::ClockTime)> {
        // Parse SRT timestamp format: "00:00:01,234 --> 00:00:05,678"
        let parts: Vec<&str> = line.split(" --> ").collect();
        if parts.len() != 2 {
            return None;
        }

        let start = self.parse_srt_time(parts[0])?;
        let end = self.parse_srt_time(parts[1])?;
        Some((start, end))
    }

    fn parse_srt_time(&self, time_str: &str) -> Option<gst::ClockTime> {
        // Parse SRT time format: "00:00:01,234"
        let parts: Vec<&str> = time_str.split(',').collect();
        if parts.len() != 2 {
            return None;
        }

        let time_part = parts[0];
        let millis: u32 = parts[1].parse().ok()?;

        let time_components: Vec<&str> = time_part.split(':').collect();
        if time_components.len() != 3 {
            return None;
        }

        let hours: u32 = time_components[0].parse().ok()?;
        let minutes: u32 = time_components[1].parse().ok()?;
        let seconds: u32 = time_components[2].parse().ok()?;

        let total_millis = hours * 3600000 + minutes * 60000 + seconds * 1000 + millis;
        let nanoseconds = total_millis as u64 * 1_000_000;
        Some(gst::ClockTime::from_nseconds(nanoseconds))
    }

    fn subtitle_to_cue(
        subtitle: &ffmpeg::Subtitle,
        packet: &ffmpeg::Packet,
        time_base: Rational,
    ) -> Option<SubtitleCue> {
        let time_to_clock_time = |time: i64| {
            let nseconds: i64 = (time * time_base.numerator() as i64 * 1_000_000_000)
                / time_base.denominator() as i64;
            gst::ClockTime::from_nseconds(nseconds as u64)
        };

        let text = subtitle
            .rects()
            .into_iter()
            .map(|rect| match rect {
                ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
                ffmpeg::subtitle::Rect::Ass(ass) => {
                    Self::extract_dialogue_text(ass.get()).unwrap_or(String::new())
                }
                _ => String::new(),
            })
            .collect::<Vec<_>>()
            .join("\n— ");

        let start = time_to_clock_time(packet.pts()?);
        let end = time_to_clock_time(packet.pts()? + packet.duration());

        Some(SubtitleCue { start, end, text })
    }

    fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
        // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
        // we need the 9th field (Text), so split on comma but only take the first 9 splits
        // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
        let text = dialogue_line.splitn(9, ',').last()?;

        // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
        let mut result = String::new();
        let mut in_tag = false;
        let mut char_iter = text.chars().peekable();

        while let Some(c) = char_iter.next() {
            if c == '{' && char_iter.peek() == Some(&'\\') {
                in_tag = true;
            } else if c == '}' {
                in_tag = false;
            } else if !in_tag {
                // process line breaks and hard spaces
                if c == '\\' {
                    match char_iter.peek() {
                        Some(&'N') => {
                            char_iter.next();
                            result.push('\n');
                        }
                        Some(&'n') | Some(&'h') => {
                            char_iter.next();
                            result.push(' ');
                        }
                        _ => result.push(c),
                    }
                } else {
                    result.push(c);
                }
            }
        }

        Some(result)
    }
}
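
// A minimal sketch of unit tests for the pure parsing helpers above. These tests are an
// illustrative addition, not part of the original module: they only exercise the existing
// `parse_srt_timestamp`, `parse_srt_time`, and `extract_dialogue_text` functions, and they
// assume SRT times use comma-separated milliseconds and that ASS dialogue lines carry the
// text in the 9th comma-separated field, as the implementations above expect.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_srt_timestamp_line() {
        let extractor = SubtitleExtractor {};
        let (start, end) = extractor
            .parse_srt_timestamp("00:00:01,234 --> 00:00:05,678")
            .expect("valid SRT timestamp line");
        assert_eq!(start, gst::ClockTime::from_mseconds(1_234));
        assert_eq!(end, gst::ClockTime::from_mseconds(5_678));
    }

    #[test]
    fn rejects_malformed_srt_time() {
        let extractor = SubtitleExtractor {};
        // dot instead of comma before the milliseconds
        assert!(extractor.parse_srt_time("00:00:01.234").is_none());
        assert!(extractor.parse_srt_timestamp("not a timestamp").is_none());
    }

    #[test]
    fn strips_ass_override_codes_and_breaks() {
        // The 9th field is the dialogue text; override tags are dropped and \N becomes a newline.
        let line = r"0,0,Default,,0,0,0,,{\i1}Hello{\i0}\NWorld";
        let text = SubtitleExtractor::extract_dialogue_text(line).unwrap();
        assert_eq!(text, "Hello\nWorld");
    }
}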