Diffstat (limited to 'src/subtitle_extractor_aishit.rs')
| -rw-r--r-- | src/subtitle_extractor_aishit.rs | 732 |
1 file changed, 732 insertions, 0 deletions
diff --git a/src/subtitle_extractor_aishit.rs b/src/subtitle_extractor_aishit.rs
new file mode 100644
index 0000000..c615f6c
--- /dev/null
+++ b/src/subtitle_extractor_aishit.rs
@@ -0,0 +1,732 @@
+use std::collections::BTreeMap;
+
+use anyhow::Result;
+
+use ffmpeg::Rational;
+use log::{debug, error, info, warn};
+use relm4::{ComponentSender, SharedState, Worker};
+
+pub type StreamIndex = usize;
+
+#[derive(Debug, Clone)]
+pub struct SubtitleCue {
+    pub start: gst::ClockTime,
+    pub end: gst::ClockTime,
+    pub text: String,
+}
+
+#[derive(Debug, Clone)]
+pub struct SubtitleTrack {
+    pub language: Option<isolang::Language>,
+    pub title: Option<String>,
+    pub cues: Vec<SubtitleCue>,
+    pub is_generated: bool, // true if generated from audio
+}
+
+pub static TRACKS: SharedState<BTreeMap<StreamIndex, SubtitleTrack>> = SharedState::new();
+
+pub struct SubtitleExtractor {}
+
+#[derive(Debug)]
+pub enum SubtitleExtractorMsg {
+    ExtractFromUrl(String),
+}
+
+#[derive(Debug)]
+pub enum SubtitleExtractorOutput {
+    NewOrUpdatedTrackMetadata(StreamIndex),
+    NewCue(StreamIndex, SubtitleCue),
+    ExtractionComplete,
+}
+
+impl Worker for SubtitleExtractor {
+    type Init = ();
+    type Input = SubtitleExtractorMsg;
+    type Output = SubtitleExtractorOutput;
+
+    fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
+        Self {}
+    }
+
+    fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
+        match msg {
+            SubtitleExtractorMsg::ExtractFromUrl(url) => {
+                self.handle_extract_from_url(url, sender);
+            }
+        }
+    }
+}
+
+impl SubtitleExtractor {
+    fn handle_extract_from_url(&mut self, url: String, sender: ComponentSender<Self>) {
+        // Clear existing tracks
+        TRACKS.write().clear();
+
+        // Try to extract subtitles using ffmpeg
+        match self.extract_subtitles_ffmpeg(&url, &sender) {
+            Ok(_) => {
+                info!("Subtitle extraction completed successfully");
+                sender
+                    .output(SubtitleExtractorOutput::ExtractionComplete)
+                    .unwrap();
+            }
+            Err(e) => {
+                error!("FFmpeg extraction failed: {}", e);
+            }
+        }
+    }
+
+    fn extract_subtitles_ffmpeg(&self, url: &str, sender: &ComponentSender<Self>) -> Result<()> {
+        info!("Starting subtitle extraction from: {}", url);
+        let mut input = ffmpeg::format::input(&url)?;
+
+        // Log input format info
+        info!(
+            "Input format: {} ({} streams)",
+            input.format().name(),
+            input.streams().count()
+        );
+
+        // Check if whisper filter is available
+        if let Some(whisper_filter) = ffmpeg::filter::find("whisper") {
+            info!("Whisper filter found: {}", whisper_filter.name());
+        } else {
+            warn!("Whisper filter not found - audio transcription will be skipped");
+        }
+
+        let mut subtitle_decoders = BTreeMap::new();
+        let mut audio_decoder: Option<ffmpeg::decoder::Audio> = None;
+        let mut _whisper_filter_graph: Option<ffmpeg::filter::Graph> = None;
+        let mut whisper_source: Option<ffmpeg::filter::Context> = None;
+        let mut whisper_sink: Option<ffmpeg::filter::Context> = None;
+        let mut best_audio_stream_index: Option<usize> = None;
+
+        // Find best audio stream for whisper processing
+        if let Some(audio_stream) = input.streams().best(ffmpeg::media::Type::Audio) {
+            best_audio_stream_index = Some(audio_stream.index());
+
+            // Get audio parameters safely
+            let codec_id = audio_stream.parameters().id();
+            let channels = if let Ok(context) =
+                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
+            {
+                if let Ok(audio) = context.decoder().audio() {
+                    audio.channels()
+                } else {
+                    0
+                }
+            } else {
+                0
+            };
+            let sample_rate = if let Ok(context) =
+                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
+            {
+                if let Ok(audio) = context.decoder().audio() {
+                    audio.rate()
+                } else {
+                    0
+                }
+            } else {
+                0
+            };
+
+            info!(
+                "Found best audio stream: index {} (codec: {:?}, channels: {}, sample_rate: {})",
+                audio_stream.index(),
+                codec_id,
+                channels,
+                sample_rate
+            );
+        } else {
+            info!("No audio stream found for whisper processing");
+        }
+
+        // Set up whisper filter graph if we found an audio stream
+        if let Some(audio_index) = best_audio_stream_index {
+            info!("Setting up whisper filter for audio stream {}", audio_index);
+
+            let audio_stream = input.stream(audio_index).unwrap();
+            if let Ok(context) =
+                ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
+            {
+                if let Ok(decoder) = context.decoder().audio() {
+                    // Get decoder properties before moving it
+                    let decoder_rate = decoder.rate();
+                    let decoder_format = decoder.format();
+                    let decoder_channel_layout = decoder.channel_layout().bits();
+
+                    audio_decoder = Some(decoder);
+
+                    // Set up whisper filter graph
+                    debug!("Creating whisper filter graph...");
+                    debug!(
+                        "Audio stream time_base: {}, decoder rate: {}, format: {:?}, channel_layout: 0x{:x}",
+                        audio_stream.time_base(),
+                        decoder_rate,
+                        decoder_format,
+                        decoder_channel_layout
+                    );
+                    match self.setup_whisper_filter(&audio_stream) {
+                        Ok((graph, source, sink)) => {
+                            info!("Whisper filter graph created successfully");
+                            _whisper_filter_graph = Some(graph);
+                            whisper_source = Some(source);
+                            whisper_sink = Some(sink);
+                            debug!("Whisper source and sink contexts stored");
+
+                            // Create a generated subtitle track
+                            let track = SubtitleTrack {
+                                language: Some(isolang::Language::from_639_1("en").unwrap_or_else(
+                                    || isolang::Language::from_639_3("eng").unwrap(),
+                                )),
+                                title: Some("Generated from Audio (Whisper)".to_string()),
+                                cues: Vec::new(),
+                                is_generated: true,
+                            };
+
+                            let whisper_stream_index = 1000; // Use high index for generated tracks
+                            TRACKS.write().insert(whisper_stream_index, track);
+
+                            sender
+                                .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
+                                    whisper_stream_index,
+                                ))
+                                .unwrap();
+                        }
+                        Err(e) => {
+                            error!("Failed to setup whisper filter: {}", e);
+                            debug!("Whisper filter error details: {:?}", e);
+                            warn!(
+                                "Audio transcription will be skipped due to filter setup failure"
+                            );
+                        }
+                    }
+                }
+            }
+        }
+
+        // Create decoder for each subtitle stream
+        for (stream_index, stream) in input.streams().enumerate() {
+            if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
+                let language_code = stream.metadata().get("language").map(|s| s.to_string());
+                let title = stream.metadata().get("title").map(|s| s.to_string());
+
+                let track = SubtitleTrack {
+                    language: language_code.and_then(|code| isolang::Language::from_639_2b(&code)),
+                    title,
+                    cues: Vec::new(),
+                    is_generated: false,
+                };
+
+                TRACKS.write().insert(stream_index, track);
+
+                sender
+                    .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
+                        stream_index,
+                    ))
+                    .unwrap();
+
+                let context =
+                    ffmpeg::codec::context::Context::from_parameters(stream.parameters())?;
+                if let Ok(decoder) = context.decoder().subtitle() {
+                    subtitle_decoders.insert(stream_index, decoder);
+                    debug!("Created decoder for subtitle stream {}", stream_index);
+                } else {
+                    error!(
+                        "Failed to create decoder for subtitle stream {}",
+                        stream_index
+                    );
+                }
+            } else {
+                debug!(
+                    "Failed to create context for subtitle stream {}",
+                    stream_index
+                );
+            }
+        }
+
+        // Process packets
+        for (stream, packet) in input.packets() {
+            let stream_index = stream.index();
+
+            // Process subtitle packets
+            if let Some(decoder) = subtitle_decoders.get_mut(&stream_index) {
+                let mut subtitle = ffmpeg::Subtitle::new();
+                if decoder.decode(&packet, &mut subtitle).is_ok() {
+                    if let Some(cue) = Self::subtitle_to_cue(&subtitle, &packet, stream.time_base())
+                    {
+                        if let Some(track) = TRACKS.write().get_mut(&stream_index) {
+                            track.cues.push(cue.clone());
+                        }
+
+                        sender
+                            .output(SubtitleExtractorOutput::NewCue(stream_index, cue))
+                            .unwrap();
+                    }
+                }
+            }
+
+            // Process audio packets for whisper
+            if Some(stream_index) == best_audio_stream_index {
+                debug!(
+                    "Processing audio packet for whisper (stream: {}, pts: {:?}, duration: {:?})",
+                    stream_index,
+                    packet.pts(),
+                    packet.duration()
+                );
+                debug!(
+                    "Audio decoder available: {}, Whisper source available: {}",
+                    audio_decoder.is_some(),
+                    whisper_source.is_some()
+                );
+                if let (Some(decoder), Some(source)) = (&mut audio_decoder, &mut whisper_source) {
+                    debug!("Both audio decoder and whisper source are available, processing...");
+                    // Send packet to audio decoder
+                    if let Err(e) = decoder.send_packet(&packet) {
+                        debug!("Failed to send packet to audio decoder: {}", e);
+                    }
+
+                    // Get decoded frames and send to whisper filter
+                    let mut frame = unsafe { ffmpeg::Frame::empty() };
+                    let mut frame_count = 0;
+                    while decoder.receive_frame(&mut frame).is_ok() {
+                        frame_count += 1;
+                        debug!(
+                            "Decoded audio frame {} (pts: {:?})",
+                            frame_count,
+                            frame.pts()
+                        );
+
+                        // Add frame to whisper filter
+                        if let Err(e) = source.source().add(&frame) {
+                            error!("Failed to add frame to whisper filter: {}", e);
+                        } else {
+                            debug!("Successfully added frame to whisper filter");
+                        }
+
+                        // Check for whisper output after adding each frame
+                        if let Some(sink) = &mut whisper_sink {
+                            self.check_whisper_output(sink, sender)?;
+                        }
+                    }
+                    if frame_count > 0 {
+                        debug!("Processed {} audio frames for whisper", frame_count);
+                    }
+                } else {
+                    debug!("Skipping audio packet - decoder or whisper source not available");
+                }
+            }
+        }
+
+        // Flush audio decoder and whisper filter
+        if let (Some(decoder), Some(source), Some(sink)) =
+            (&mut audio_decoder, &mut whisper_source, &mut whisper_sink)
+        {
+            info!("Flushing audio decoder and whisper filter...");
+            // Flush decoder
+            if let Err(e) = decoder.send_eof() {
+                debug!("Failed to send EOF to decoder: {}", e);
+            }
+            let mut frame = unsafe { ffmpeg::Frame::empty() };
+            let mut final_frame_count = 0;
+            while decoder.receive_frame(&mut frame).is_ok() {
+                final_frame_count += 1;
+                source.source().add(&frame).ok();
+            }
+            debug!("Flushed {} final frames from decoder", final_frame_count);
+
+            // Flush filter and get results
+            debug!("Flushing whisper filter...");
+            if let Err(e) = source.source().flush() {
+                error!("Failed to flush whisper filter: {}", e);
+            }
+
+            info!("Processing final whisper filter output...");
+            self.check_whisper_output(sink, sender)?;
+        }
+
+        Ok(())
+    }
+
+    fn setup_whisper_filter(
+        &self,
+        audio_stream: &ffmpeg::Stream,
+    ) -> Result<(
+        ffmpeg::filter::Graph,
+        ffmpeg::filter::Context,
+        ffmpeg::filter::Context,
+    )> {
+        debug!("Setting up whisper filter graph...");
+        let mut filter_graph = ffmpeg::filter::Graph::new();
+        debug!("Filter graph created successfully");
+
+        // Get audio parameters
+        debug!("Getting audio parameters...");
+        let time_base = audio_stream.time_base();
+        let audio_params = audio_stream.parameters();
+        debug!("Creating context from parameters...");
+        let context = ffmpeg::codec::context::Context::from_parameters(audio_params)?;
+        debug!("Getting audio decoder from context...");
+        let audio_decoder = context.decoder().audio()?;
+        debug!("Audio decoder created successfully");
+
+        // Create buffer source
+        let buffer_args = format!(
+            "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
+            time_base,
+            audio_decoder.rate(),
+            audio_decoder.format().name(),
+            audio_decoder.channel_layout().bits()
+        );
+        debug!("Buffer args: {}", buffer_args);
+
+        debug!("Looking for abuffer filter...");
+        let abuffer_filter = ffmpeg::filter::find("abuffer")
+            .ok_or_else(|| anyhow::anyhow!("abuffer filter not found"))?;
+        debug!("abuffer filter found: {}", abuffer_filter.name());
+
+        debug!("Adding abuffer filter...");
+        match filter_graph.add(&abuffer_filter, "src", &buffer_args) {
+            Ok(_) => debug!("abuffer filter added successfully"),
+            Err(e) => {
+                error!("Failed to add abuffer filter: {}", e);
+                return Err(anyhow::anyhow!("Failed to add abuffer filter: {}", e));
+            }
+        }
+
+        // Create whisper filter with parameters
+        // Try absolute path and different parameter formats
+        let model_path = std::path::Path::new("./whisper-models/ggml-large-v3.bin");
+        let absolute_path = if model_path.exists() {
+            model_path
+                .canonicalize()
+                .map(|p| p.to_string_lossy().to_string())
+                .unwrap_or_else(|_| "./whisper-models/ggml-large-v3.bin".to_string())
+        } else {
+            warn!("Whisper model file not found at: {:?}", model_path);
+            "./whisper-models/ggml-large-v3.bin".to_string()
+        };
+
+        debug!("Model path exists: {}", model_path.exists());
+        debug!("Using absolute path: {}", absolute_path);
+
+        debug!("Looking for whisper filter...");
+        let whisper_filter = ffmpeg::filter::find("whisper").ok_or_else(|| {
+            error!("Whisper filter not found! Make sure FFmpeg was compiled with whisper support");
+            anyhow::anyhow!("Whisper filter not available")
+        })?;
+
+        debug!("Whisper filter found: {}", whisper_filter.name());
+        // We'll create the whisper filter through the parse method instead of adding it manually
+
+        // Create audio buffer sink for whisper output (whisper outputs audio + metadata)
+        debug!("Looking for abuffersink filter for audio output...");
+        let abuffersink_filter = ffmpeg::filter::find("abuffersink")
+            .ok_or_else(|| anyhow::anyhow!("abuffersink filter not found"))?;
+        debug!("abuffersink filter found: {}", abuffersink_filter.name());
+
+        debug!("Adding abuffersink filter...");
+        match filter_graph.add(&abuffersink_filter, "sink", "") {
+            Ok(_) => debug!("abuffersink filter added successfully"),
+            Err(e) => {
+                error!("Failed to add abuffersink filter: {}", e);
+                return Err(anyhow::anyhow!("Failed to add abuffersink filter: {}", e));
+            }
+        }
+
+        // Connect filters using the complete filter chain description
+        debug!("Connecting filter graph with complete chain: src -> whisper -> sink");
+
+        let filter_chain = format!(
+            "[src]whisper=model={}:queue=30:format=json[sink]",
+            "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin",
+            //"/Users/malte/repos/lleap/whisper-models/ggml-silero-v5.1.2.bin"
+        );
+        debug!("Using filter chain: {}", filter_chain);
+
+        if let Err(e) = filter_graph
+            .output("src", 0)
+            .and_then(|o| o.input("sink", 0))
+            .and_then(|i| i.parse(&filter_chain))
+        {
+            error!("Failed to connect filter graph: {}", e);
+            return Err(anyhow::anyhow!("Failed to connect filter graph: {}", e));
+        }
+        debug!("Filter graph connected successfully");
+
+        // Validate filter graph
+        debug!("Validating filter graph...");
+        match filter_graph.validate() {
+            Ok(_) => {
+                info!("Filter graph validated successfully");
+                debug!("Filter graph dump:\n{}", filter_graph.dump());
+            }
+            Err(e) => {
+                error!("Filter graph validation failed: {}", e);
+                debug!(
+                    "Filter graph dump before validation failure:\n{}",
+                    filter_graph.dump()
+                );
+                return Err(anyhow::anyhow!("Filter graph validation failed: {}", e));
+            }
+        }
+
+        debug!("Getting final source and sink contexts...");
+        let source_ctx = filter_graph
+            .get("src")
+            .ok_or_else(|| anyhow::anyhow!("Source context not found"))?;
+        let sink_ctx = filter_graph
+            .get("sink")
+            .ok_or_else(|| anyhow::anyhow!("Sink context not found"))?;
+        debug!("Final contexts retrieved successfully");
+
+        Ok((filter_graph, source_ctx, sink_ctx))
+    }
+
+    fn check_whisper_output(
+        &self,
+        sink: &mut ffmpeg::filter::Context,
+        sender: &ComponentSender<Self>,
+    ) -> Result<()> {
+        debug!("Attempting to read audio frames from whisper filter output...");
+
+        // The whisper filter outputs audio frames with subtitle data in "lavfi.whisper.text" metadata
+        let mut frame = unsafe { ffmpeg::Frame::empty() };
+        let mut output_count = 0;
+
+        while sink.sink().frame(&mut frame).is_ok() {
+            output_count += 1;
+            debug!(
+                "Received audio frame {} from whisper filter (pts: {:?})",
+                output_count,
+                frame.pts()
+            );
+
+            // Look specifically for lavfi.whisper.text metadata
+            if let Some(whisper_text) = frame.metadata().get("lavfi.whisper.text") {
+                info!("Found whisper transcription: {}", whisper_text);
+
+                let start_time = if let Some(pts) = frame.pts() {
+                    // Convert PTS to nanoseconds based on whisper filter's time base (16kHz)
+                    gst::ClockTime::from_nseconds((pts as u64 * 1_000_000_000) / 16000)
+                } else {
+                    gst::ClockTime::ZERO
+                };
+
+                // Log all available metadata keys to help debug
+                let metadata_entries: Vec<(String, String)> = frame
+                    .metadata()
+                    .iter()
+                    .map(|(k, v)| (k.to_string(), v.to_string()))
+                    .collect();
+                if !metadata_entries.is_empty() {
+                    let metadata_keys: Vec<String> =
+                        metadata_entries.iter().map(|(k, _)| k.clone()).collect();
+                    debug!("Frame metadata keys: {:?}", metadata_keys);
+                }
+
+                // Parse the whisper text (might be JSON format)
+                self.parse_whisper_text(whisper_text, start_time, sender)?;
+            }
+        }
+
+        if output_count > 0 {
+            info!("Processed {} frames from whisper filter", output_count);
+        } else {
+            debug!("No frames available from whisper filter");
+        }
+
+        Ok(())
+    }
+
+    fn parse_whisper_text(
+        &self,
+        whisper_text: &str,
+        base_time: gst::ClockTime,
+        sender: &ComponentSender<Self>,
+    ) -> Result<()> {
+        debug!("Parsing whisper text: {}", whisper_text);
+
+        // The whisper text might be in different formats depending on the filter configuration
+        // For now, treat it as plain text and create a single cue
+        let cue = SubtitleCue {
+            start: base_time,
+            end: base_time + gst::ClockTime::from_seconds(3), // Default 3 second duration
+            text: whisper_text.to_string(),
+        };
+
+        let whisper_stream_index = 1000;
+        if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) {
+            track.cues.push(cue.clone());
+        }
+
+        sender
+            .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue))
+            .unwrap();
+
+        Ok(())
+    }
+
+    fn parse_whisper_subtitle_data(
+        &self,
+        subtitle_data: &str,
+        sender: &ComponentSender<Self>,
+    ) -> Result<()> {
+        // Parse SRT-format output from whisper
+        info!(
+            "Parsing whisper subtitle data ({} characters)",
+            subtitle_data.len()
+        );
+        debug!("Subtitle data content:\n{}", subtitle_data);
+        let lines: Vec<&str> = subtitle_data.lines().collect();
+        let mut i = 0;
+
+        while i < lines.len() {
+            // Skip subtitle number
+            if lines[i].trim().parse::<i32>().is_ok() {
+                i += 1;
+            }
+
+            // Parse timestamp line
+            if i < lines.len() {
+                if let Some((start, end)) = self.parse_srt_timestamp(lines[i]) {
+                    i += 1;
+
+                    // Collect text lines
+                    let mut text_lines = Vec::new();
+                    while i < lines.len() && !lines[i].trim().is_empty() {
+                        text_lines.push(lines[i].to_string());
+                        i += 1;
+                    }
+
+                    if !text_lines.is_empty() {
+                        let cue = SubtitleCue {
+                            start,
+                            end,
+                            text: text_lines.join("\n"),
+                        };
+
+                        let whisper_stream_index = 1000;
+                        if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) {
+                            track.cues.push(cue.clone());
+                        }
+
+                        sender
+                            .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue))
+                            .unwrap();
+                    }
+                }
+            }
+            i += 1;
+        }
+
+        Ok(())
+    }
+
+    fn parse_srt_timestamp(&self, line: &str) -> Option<(gst::ClockTime, gst::ClockTime)> {
+        // Parse SRT timestamp format: "00:00:01,234 --> 00:00:05,678"
+        let parts: Vec<&str> = line.split(" --> ").collect();
+        if parts.len() != 2 {
+            return None;
+        }
+
+        let start = self.parse_srt_time(parts[0])?;
+        let end = self.parse_srt_time(parts[1])?;
+
+        Some((start, end))
+    }
+
+    fn parse_srt_time(&self, time_str: &str) -> Option<gst::ClockTime> {
+        // Parse SRT time format: "00:00:01,234"
+        let parts: Vec<&str> = time_str.split(',').collect();
+        if parts.len() != 2 {
+            return None;
+        }
+
+        let time_part = parts[0];
+        let millis: u32 = parts[1].parse().ok()?;
+
+        let time_components: Vec<&str> = time_part.split(':').collect();
+        if time_components.len() != 3 {
+            return None;
+        }
+
+        let hours: u32 = time_components[0].parse().ok()?;
+        let minutes: u32 = time_components[1].parse().ok()?;
+        let seconds: u32 = time_components[2].parse().ok()?;
+
+        let total_millis = hours * 3600000 + minutes * 60000 + seconds * 1000 + millis;
+        let nanoseconds = total_millis as u64 * 1_000_000;
+
+        Some(gst::ClockTime::from_nseconds(nanoseconds))
+    }
+
+    fn subtitle_to_cue(
+        subtitle: &ffmpeg::Subtitle,
+        packet: &ffmpeg::Packet,
+        time_base: Rational,
+    ) -> Option<SubtitleCue> {
+        let time_to_clock_time = |time: i64| {
+            let nseconds: i64 = (time * time_base.numerator() as i64 * 1_000_000_000)
+                / time_base.denominator() as i64;
+            gst::ClockTime::from_nseconds(nseconds as u64)
+        };
+
+        let text = subtitle
+            .rects()
+            .into_iter()
+            .map(|rect| match rect {
+                ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
+                ffmpeg::subtitle::Rect::Ass(ass) => {
+                    Self::extract_dialogue_text(ass.get()).unwrap_or(String::new())
+                }
+                _ => String::new(),
+            })
+            .collect::<Vec<String>>()
+            .join("\n— ");
+
+        let start = time_to_clock_time(packet.pts()?);
+        let end = time_to_clock_time(packet.pts()? + packet.duration());
+
+        Some(SubtitleCue { start, end, text })
+    }
+
+    fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
+        // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
+        // we need the 9th field (Text), so split on comma but only take first 9 splits
+        // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
+        let text = dialogue_line.splitn(9, ',').last()?;
+
+        // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
+        let mut result = String::new();
+        let mut in_tag = false;
+        let mut char_iter = text.chars().peekable();
+
+        while let Some(c) = char_iter.next() {
+            if c == '{' && char_iter.peek() == Some(&'\\') {
+                in_tag = true;
+            } else if c == '}' {
+                in_tag = false;
+            } else if !in_tag {
+                // process line breaks and hard spaces
+                if c == '\\' {
+                    match char_iter.peek() {
+                        Some(&'N') => {
+                            char_iter.next();
+                            result.push('\n');
+                        }
+                        Some(&'n') | Some(&'h') => {
+                            char_iter.next();
+                            result.push(' ');
+                        }
+                        _ => result.push(c),
+                    }
+                } else {
+                    result.push(c);
+                }
+            }
+        }
+
+        Some(result)
+    }
+}