author    Malte Voos <git@mal.tc>    2025-11-21 23:25:41 +0100
committer Malte Voos <git@mal.tc>    2025-11-21 23:25:41 +0100
commit    90a1d5729c32910c249460cfe56ad682fd3fd608
tree      4afeee7738986aebeece84a6944b43b32d0122d2
parent    016b76acba13e86df59f818581aa61f7bbaffff8
overhaul autopausing
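
Replace the OptionTracker-based cue bookkeeping in src/app.rs with a
generic Tracker<Option<T>> carrying an explicit dirty flag, so the
current cue text and the last cue index are change-tracked
independently. Auto-pause now fires when the hovered primary cue ends,
position updates arriving after an auto-pause are ignored instead of
immediately resuming playback, and the cue lookup is factored into
update_cue. The src/subtitle_extractor_aishit.rs worker is removed
entirely.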
-rw-r--r--  src/app.rs                        157
-rw-r--r--  src/subtitle_extractor_aishit.rs  732
2 files changed, 91 insertions, 798 deletions
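
The diff below relies on a Tracker type from src/util.rs that is not
shown in this commit. As a rough mental model only, a minimal
dirty-flag wrapper consistent with how the diff calls new, get, set,
is_dirty, and reset might look like the following hypothetical sketch;
in particular, whether set compares against the stored value before
raising the flag is an assumption, not something the patch confirms:

    /// Hypothetical sketch of util::Tracker, inferred from call sites in
    /// this diff; the real definition in src/util.rs is not part of the commit.
    pub struct Tracker<T> {
        value: T,
        dirty: bool,
    }

    impl<T: PartialEq> Tracker<T> {
        pub fn new(value: T) -> Self {
            Self { value, dirty: false }
        }

        /// Borrow the current value without touching the dirty flag.
        pub fn get(&self) -> &T {
            &self.value
        }

        /// Store a new value; assume the flag is raised only on an actual change.
        pub fn set(&mut self, value: T) {
            if self.value != value {
                self.value = value;
                self.dirty = true;
            }
        }

        /// Did the value change since the last reset()?
        pub fn is_dirty(&self) -> bool {
            self.dirty
        }

        /// Acknowledge the change so it is handled only once.
        pub fn reset(&mut self) {
            self.dirty = false;
        }
    }

Under that assumption, App::update can call update_cue on every
position tick and react exactly once per cue transition via
is_dirty() and reset(), which is the pattern the hunks below follow.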
diff --git a/src/app.rs b/src/app.rs
index 7aa5abd..066980c 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -12,7 +12,7 @@ use crate::{
subtitle_view::{SubtitleView, SubtitleViewMsg, SubtitleViewOutput},
tracks::{SUBTITLE_TRACKS, StreamIndex, SubtitleCue},
transcript::{Transcript, TranscriptMsg, TranscriptOutput},
- util::OptionTracker,
+ util::{OptionTracker, Tracker},
};
pub struct App {
@@ -26,13 +26,14 @@ pub struct App {
subtitle_selection_dialog: Controller<SubtitleSelectionDialog>,
primary_stream_ix: Option<StreamIndex>,
- primary_last_cue_ix: OptionTracker<usize>,
+ primary_cue: Tracker<Option<String>>,
+ primary_last_cue_ix: Tracker<Option<usize>>,
+ secondary_cue: Tracker<Option<String>>,
secondary_stream_ix: Option<StreamIndex>,
- secondary_last_cue_ix: OptionTracker<usize>,
+ secondary_last_cue_ix: Tracker<Option<usize>>,
// for auto-pausing
autopaused: bool,
- primary_cue_active: bool,
hovering_primary_cue: bool,
}
@@ -162,12 +163,13 @@ impl SimpleComponent for App {
subtitle_selection_dialog,
primary_stream_ix: None,
- primary_last_cue_ix: OptionTracker::new(None),
+ primary_cue: Tracker::new(None),
+ primary_last_cue_ix: Tracker::new(None),
secondary_stream_ix: None,
- secondary_last_cue_ix: OptionTracker::new(None),
+ secondary_cue: Tracker::new(None),
+ secondary_last_cue_ix: Tracker::new(None),
autopaused: false,
- primary_cue_active: false,
hovering_primary_cue: false,
};
@@ -177,9 +179,6 @@ impl SimpleComponent for App {
}
fn update(&mut self, message: Self::Input, _sender: ComponentSender<Self>) {
- self.primary_last_cue_ix.reset();
- self.secondary_last_cue_ix.reset();
-
match message {
AppMsg::NewCue(stream_index, cue) => {
self.transcript
@@ -203,20 +202,41 @@ impl SimpleComponent for App {
}
AppMsg::PositionUpdate(pos) => {
if let Some(stream_ix) = self.primary_stream_ix {
- let cue =
- Self::get_cue_and_update_ix(stream_ix, pos, &mut self.primary_last_cue_ix);
- let cue_is_some = cue.is_some();
-
- // beginning of new subtitle
- if self.primary_last_cue_ix.is_dirty()
- || (!self.primary_cue_active && cue_is_some)
- {
+ // sometimes we get a few position update messages after
+ // auto-pausing; this prevents us from immediately un-autopausing
+ // again
+ if self.autopaused {
+ return;
+ }
+
+ let cue_was_some = self.primary_cue.get().is_some();
+
+ Self::update_cue(
+ stream_ix,
+ pos,
+ &mut self.primary_cue,
+ &mut self.primary_last_cue_ix,
+ );
+
+ if self.primary_cue.is_dirty() {
+ // last cue just ended -> auto-pause
+ if cue_was_some && self.hovering_primary_cue {
+ self.player.sender().send(PlayerMsg::Pause).unwrap();
+ self.autopaused = true;
+ return;
+ }
+
self.subtitle_view
.sender()
- .send(SubtitleViewMsg::SetPrimaryCue(cue))
+ .send(SubtitleViewMsg::SetPrimaryCue(
+ self.primary_cue.get().clone(),
+ ))
.unwrap();
- self.primary_cue_active = cue_is_some;
+ self.primary_cue.reset();
+ }
+
+ if self.primary_last_cue_ix.is_dirty() {
if let Some(ix) = self.primary_last_cue_ix.get() {
self.transcript
.sender()
@@ -226,33 +246,24 @@ impl SimpleComponent for App {
self.primary_last_cue_ix.reset();
}
-
- // end of current subtitle
- if self.primary_cue_active && !cue_is_some && !self.autopaused {
- if self.hovering_primary_cue {
- self.player.sender().send(PlayerMsg::Pause).unwrap();
- self.autopaused = true;
- } else {
- self.subtitle_view
- .sender()
- .send(SubtitleViewMsg::SetPrimaryCue(None))
- .unwrap();
- self.primary_cue_active = false;
- }
- }
}
if let Some(stream_ix) = self.secondary_stream_ix {
- if !self.autopaused {
+ Self::update_cue(
+ stream_ix,
+ pos,
+ &mut self.secondary_cue,
+ &mut self.secondary_last_cue_ix,
+ );
+
+ if !self.autopaused && self.secondary_cue.is_dirty() {
self.subtitle_view
.sender()
.send(SubtitleViewMsg::SetSecondaryCue(
- Self::get_cue_and_update_ix(
- stream_ix,
- pos,
- &mut self.secondary_last_cue_ix,
- ),
+ self.secondary_cue.get().clone(),
))
.unwrap();
+
+ self.secondary_cue.reset();
}
}
}
@@ -302,50 +313,64 @@ impl SimpleComponent for App {
}
impl App {
- fn get_cue_and_update_ix(
+ fn update_cue(
stream_ix: StreamIndex,
position: gst::ClockTime,
- last_cue_ix: &mut OptionTracker<usize>,
- ) -> Option<String> {
+ cue: &mut Tracker<Option<String>>,
+ last_cue_ix: &mut Tracker<Option<usize>>,
+ ) {
let lock = SUBTITLE_TRACKS.read();
- let track = lock.get(&stream_ix)?;
+ let track = lock.get(&stream_ix).unwrap();
// try to find current cue quickly (should usually succeed during playback)
if let Some(ix) = last_cue_ix.get() {
- let last_cue = track.cues.get(*ix)?;
+ let last_cue = track.cues.get(*ix).unwrap();
if last_cue.start <= position && position <= last_cue.end {
- return Some(last_cue.text.clone());
- }
- let next_cue = track.cues.get(ix + 1)?;
- if last_cue.end < position && position < next_cue.start {
- return None;
- }
- if next_cue.start <= position && position <= next_cue.end {
- last_cue_ix.set(Some(ix + 1));
- return Some(next_cue.text.clone());
+ // still at current cue
+ return;
+ } else if let Some(next_cue) = track.cues.get(ix + 1) {
+ if last_cue.end < position && position < next_cue.start {
+ // strictly between cues
+ cue.set(None);
+ return;
+ }
+ if next_cue.start <= position && position <= next_cue.end {
+ // already in next cue (this happens when one cue immediately
+ // follows the previous one)
+ cue.set(Some(next_cue.text.clone()));
+ last_cue_ix.set(Some(ix + 1));
+ return;
+ }
}
}
// if we are before the first subtitle, no need to look further
- if position < track.cues.first()?.start {
+ if track.cues.is_empty() || position < track.cues.first().unwrap().start {
+ cue.set(None);
last_cue_ix.set(None);
- return None;
+ return;
}
// otherwise, search the whole track (e.g. after seeking)
- let (ix, cue) = track
+ match track
.cues
.iter()
.enumerate()
.rev()
- .find(|(_ix, cue)| cue.start <= position)?;
-
- last_cue_ix.set(Some(ix));
-
- if position <= cue.end {
- Some(cue.text.clone())
- } else {
- None
- }
+ .find(|(_ix, cue)| cue.start <= position)
+ {
+ Some((ix, new_cue)) => {
+ last_cue_ix.set(Some(ix));
+ if position <= new_cue.end {
+ cue.set(Some(new_cue.text.clone()));
+ } else {
+ cue.set(None);
+ }
+ }
+ None => {
+ cue.set(None);
+ last_cue_ix.set(None);
+ }
+ };
}
}
diff --git a/src/subtitle_extractor_aishit.rs b/src/subtitle_extractor_aishit.rs
deleted file mode 100644
index c615f6c..0000000
--- a/src/subtitle_extractor_aishit.rs
+++ /dev/null
@@ -1,732 +0,0 @@
-use std::collections::BTreeMap;
-
-use anyhow::Result;
-
-use ffmpeg::Rational;
-use log::{debug, error, info, warn};
-use relm4::{ComponentSender, SharedState, Worker};
-
-pub type StreamIndex = usize;
-
-#[derive(Debug, Clone)]
-pub struct SubtitleCue {
- pub start: gst::ClockTime,
- pub end: gst::ClockTime,
- pub text: String,
-}
-
-#[derive(Debug, Clone)]
-pub struct SubtitleTrack {
- pub language: Option<isolang::Language>,
- pub title: Option<String>,
- pub cues: Vec<SubtitleCue>,
- pub is_generated: bool, // true if generated from audio
-}
-
-pub static TRACKS: SharedState<BTreeMap<StreamIndex, SubtitleTrack>> = SharedState::new();
-
-pub struct SubtitleExtractor {}
-
-#[derive(Debug)]
-pub enum SubtitleExtractorMsg {
- ExtractFromUrl(String),
-}
-
-#[derive(Debug)]
-pub enum SubtitleExtractorOutput {
- NewOrUpdatedTrackMetadata(StreamIndex),
- NewCue(StreamIndex, SubtitleCue),
- ExtractionComplete,
-}
-
-impl Worker for SubtitleExtractor {
- type Init = ();
- type Input = SubtitleExtractorMsg;
- type Output = SubtitleExtractorOutput;
-
- fn init(_init: Self::Init, _sender: ComponentSender<Self>) -> Self {
- Self {}
- }
-
- fn update(&mut self, msg: SubtitleExtractorMsg, sender: ComponentSender<Self>) {
- match msg {
- SubtitleExtractorMsg::ExtractFromUrl(url) => {
- self.handle_extract_from_url(url, sender);
- }
- }
- }
-}
-
-impl SubtitleExtractor {
- fn handle_extract_from_url(&mut self, url: String, sender: ComponentSender<Self>) {
- // Clear existing tracks
- TRACKS.write().clear();
-
- // Try to extract subtitles using ffmpeg
- match self.extract_subtitles_ffmpeg(&url, &sender) {
- Ok(_) => {
- info!("Subtitle extraction completed successfully");
- sender
- .output(SubtitleExtractorOutput::ExtractionComplete)
- .unwrap();
- }
- Err(e) => {
- error!("FFmpeg extraction failed: {}", e);
- }
- }
- }
-
- fn extract_subtitles_ffmpeg(&self, url: &str, sender: &ComponentSender<Self>) -> Result<()> {
- info!("Starting subtitle extraction from: {}", url);
- let mut input = ffmpeg::format::input(&url)?;
-
- // Log input format info
- info!(
- "Input format: {} ({} streams)",
- input.format().name(),
- input.streams().count()
- );
-
- // Check if whisper filter is available
- if let Some(whisper_filter) = ffmpeg::filter::find("whisper") {
- info!("Whisper filter found: {}", whisper_filter.name());
- } else {
- warn!("Whisper filter not found - audio transcription will be skipped");
- }
-
- let mut subtitle_decoders = BTreeMap::new();
- let mut audio_decoder: Option<ffmpeg::decoder::Audio> = None;
- let mut _whisper_filter_graph: Option<ffmpeg::filter::Graph> = None;
- let mut whisper_source: Option<ffmpeg::filter::Context> = None;
- let mut whisper_sink: Option<ffmpeg::filter::Context> = None;
- let mut best_audio_stream_index: Option<usize> = None;
-
- // Find best audio stream for whisper processing
- if let Some(audio_stream) = input.streams().best(ffmpeg::media::Type::Audio) {
- best_audio_stream_index = Some(audio_stream.index());
-
- // Get audio parameters safely
- let codec_id = audio_stream.parameters().id();
- let channels = if let Ok(context) =
- ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
- {
- if let Ok(audio) = context.decoder().audio() {
- audio.channels()
- } else {
- 0
- }
- } else {
- 0
- };
- let sample_rate = if let Ok(context) =
- ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
- {
- if let Ok(audio) = context.decoder().audio() {
- audio.rate()
- } else {
- 0
- }
- } else {
- 0
- };
-
- info!(
- "Found best audio stream: index {} (codec: {:?}, channels: {}, sample_rate: {})",
- audio_stream.index(),
- codec_id,
- channels,
- sample_rate
- );
- } else {
- info!("No audio stream found for whisper processing");
- }
-
- // Set up whisper filter graph if we found an audio stream
- if let Some(audio_index) = best_audio_stream_index {
- info!("Setting up whisper filter for audio stream {}", audio_index);
-
- let audio_stream = input.stream(audio_index).unwrap();
- if let Ok(context) =
- ffmpeg::codec::context::Context::from_parameters(audio_stream.parameters())
- {
- if let Ok(decoder) = context.decoder().audio() {
- // Get decoder properties before moving it
- let decoder_rate = decoder.rate();
- let decoder_format = decoder.format();
- let decoder_channel_layout = decoder.channel_layout().bits();
-
- audio_decoder = Some(decoder);
-
- // Set up whisper filter graph
- debug!("Creating whisper filter graph...");
- debug!(
- "Audio stream time_base: {}, decoder rate: {}, format: {:?}, channel_layout: 0x{:x}",
- audio_stream.time_base(),
- decoder_rate,
- decoder_format,
- decoder_channel_layout
- );
- match self.setup_whisper_filter(&audio_stream) {
- Ok((graph, source, sink)) => {
- info!("Whisper filter graph created successfully");
- _whisper_filter_graph = Some(graph);
- whisper_source = Some(source);
- whisper_sink = Some(sink);
- debug!("Whisper source and sink contexts stored");
-
- // Create a generated subtitle track
- let track = SubtitleTrack {
- language: Some(isolang::Language::from_639_1("en").unwrap_or_else(
- || isolang::Language::from_639_3("eng").unwrap(),
- )),
- title: Some("Generated from Audio (Whisper)".to_string()),
- cues: Vec::new(),
- is_generated: true,
- };
-
- let whisper_stream_index = 1000; // Use high index for generated tracks
- TRACKS.write().insert(whisper_stream_index, track);
-
- sender
- .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
- whisper_stream_index,
- ))
- .unwrap();
- }
- Err(e) => {
- error!("Failed to setup whisper filter: {}", e);
- debug!("Whisper filter error details: {:?}", e);
- warn!(
- "Audio transcription will be skipped due to filter setup failure"
- );
- }
- }
- }
- }
- }
-
- // Create decoder for each subtitle stream
- for (stream_index, stream) in input.streams().enumerate() {
- if stream.parameters().medium() == ffmpeg::media::Type::Subtitle {
- let language_code = stream.metadata().get("language").map(|s| s.to_string());
- let title = stream.metadata().get("title").map(|s| s.to_string());
-
- let track = SubtitleTrack {
- language: language_code.and_then(|code| isolang::Language::from_639_2b(&code)),
- title,
- cues: Vec::new(),
- is_generated: false,
- };
-
- TRACKS.write().insert(stream_index, track);
-
- sender
- .output(SubtitleExtractorOutput::NewOrUpdatedTrackMetadata(
- stream_index,
- ))
- .unwrap();
-
- let context =
- ffmpeg::codec::context::Context::from_parameters(stream.parameters())?;
- if let Ok(decoder) = context.decoder().subtitle() {
- subtitle_decoders.insert(stream_index, decoder);
- debug!("Created decoder for subtitle stream {}", stream_index);
- } else {
- error!(
- "Failed to create decoder for subtitle stream {}",
- stream_index
- );
- }
- } else {
- debug!(
- "Failed to create context for subtitle stream {}",
- stream_index
- );
- }
- }
-
- // Process packets
- for (stream, packet) in input.packets() {
- let stream_index = stream.index();
-
- // Process subtitle packets
- if let Some(decoder) = subtitle_decoders.get_mut(&stream_index) {
- let mut subtitle = ffmpeg::Subtitle::new();
- if decoder.decode(&packet, &mut subtitle).is_ok() {
- if let Some(cue) = Self::subtitle_to_cue(&subtitle, &packet, stream.time_base())
- {
- if let Some(track) = TRACKS.write().get_mut(&stream_index) {
- track.cues.push(cue.clone());
- }
-
- sender
- .output(SubtitleExtractorOutput::NewCue(stream_index, cue))
- .unwrap();
- }
- }
- }
-
- // Process audio packets for whisper
- if Some(stream_index) == best_audio_stream_index {
- debug!(
- "Processing audio packet for whisper (stream: {}, pts: {:?}, duration: {:?})",
- stream_index,
- packet.pts(),
- packet.duration()
- );
- debug!(
- "Audio decoder available: {}, Whisper source available: {}",
- audio_decoder.is_some(),
- whisper_source.is_some()
- );
- if let (Some(decoder), Some(source)) = (&mut audio_decoder, &mut whisper_source) {
- debug!("Both audio decoder and whisper source are available, processing...");
- // Send packet to audio decoder
- if let Err(e) = decoder.send_packet(&packet) {
- debug!("Failed to send packet to audio decoder: {}", e);
- }
-
- // Get decoded frames and send to whisper filter
- let mut frame = unsafe { ffmpeg::Frame::empty() };
- let mut frame_count = 0;
- while decoder.receive_frame(&mut frame).is_ok() {
- frame_count += 1;
- debug!(
- "Decoded audio frame {} (pts: {:?})",
- frame_count,
- frame.pts()
- );
-
- // Add frame to whisper filter
- if let Err(e) = source.source().add(&frame) {
- error!("Failed to add frame to whisper filter: {}", e);
- } else {
- debug!("Successfully added frame to whisper filter");
- }
-
- // Check for whisper output after adding each frame
- if let Some(sink) = &mut whisper_sink {
- self.check_whisper_output(sink, sender)?;
- }
- }
- if frame_count > 0 {
- debug!("Processed {} audio frames for whisper", frame_count);
- }
- } else {
- debug!("Skipping audio packet - decoder or whisper source not available");
- }
- }
- }
-
- // Flush audio decoder and whisper filter
- if let (Some(decoder), Some(source), Some(sink)) =
- (&mut audio_decoder, &mut whisper_source, &mut whisper_sink)
- {
- info!("Flushing audio decoder and whisper filter...");
- // Flush decoder
- if let Err(e) = decoder.send_eof() {
- debug!("Failed to send EOF to decoder: {}", e);
- }
- let mut frame = unsafe { ffmpeg::Frame::empty() };
- let mut final_frame_count = 0;
- while decoder.receive_frame(&mut frame).is_ok() {
- final_frame_count += 1;
- source.source().add(&frame).ok();
- }
- debug!("Flushed {} final frames from decoder", final_frame_count);
-
- // Flush filter and get results
- debug!("Flushing whisper filter...");
- if let Err(e) = source.source().flush() {
- error!("Failed to flush whisper filter: {}", e);
- }
-
- info!("Processing final whisper filter output...");
- self.check_whisper_output(sink, sender)?;
- }
-
- Ok(())
- }
-
- fn setup_whisper_filter(
- &self,
- audio_stream: &ffmpeg::Stream,
- ) -> Result<(
- ffmpeg::filter::Graph,
- ffmpeg::filter::Context,
- ffmpeg::filter::Context,
- )> {
- debug!("Setting up whisper filter graph...");
- let mut filter_graph = ffmpeg::filter::Graph::new();
- debug!("Filter graph created successfully");
-
- // Get audio parameters
- debug!("Getting audio parameters...");
- let time_base = audio_stream.time_base();
- let audio_params = audio_stream.parameters();
- debug!("Creating context from parameters...");
- let context = ffmpeg::codec::context::Context::from_parameters(audio_params)?;
- debug!("Getting audio decoder from context...");
- let audio_decoder = context.decoder().audio()?;
- debug!("Audio decoder created successfully");
-
- // Create buffer source
- let buffer_args = format!(
- "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
- time_base,
- audio_decoder.rate(),
- audio_decoder.format().name(),
- audio_decoder.channel_layout().bits()
- );
- debug!("Buffer args: {}", buffer_args);
-
- debug!("Looking for abuffer filter...");
- let abuffer_filter = ffmpeg::filter::find("abuffer")
- .ok_or_else(|| anyhow::anyhow!("abuffer filter not found"))?;
- debug!("abuffer filter found: {}", abuffer_filter.name());
-
- debug!("Adding abuffer filter...");
- match filter_graph.add(&abuffer_filter, "src", &buffer_args) {
- Ok(_) => debug!("abuffer filter added successfully"),
- Err(e) => {
- error!("Failed to add abuffer filter: {}", e);
- return Err(anyhow::anyhow!("Failed to add abuffer filter: {}", e));
- }
- }
-
- // Create whisper filter with parameters
- // Try absolute path and different parameter formats
- let model_path = std::path::Path::new("./whisper-models/ggml-large-v3.bin");
- let absolute_path = if model_path.exists() {
- model_path
- .canonicalize()
- .map(|p| p.to_string_lossy().to_string())
- .unwrap_or_else(|_| "./whisper-models/ggml-large-v3.bin".to_string())
- } else {
- warn!("Whisper model file not found at: {:?}", model_path);
- "./whisper-models/ggml-large-v3.bin".to_string()
- };
-
- debug!("Model path exists: {}", model_path.exists());
- debug!("Using absolute path: {}", absolute_path);
-
- debug!("Looking for whisper filter...");
- let whisper_filter = ffmpeg::filter::find("whisper").ok_or_else(|| {
- error!("Whisper filter not found! Make sure FFmpeg was compiled with whisper support");
- anyhow::anyhow!("Whisper filter not available")
- })?;
-
- debug!("Whisper filter found: {}", whisper_filter.name());
- // We'll create the whisper filter through the parse method instead of adding it manually
-
- // Create audio buffer sink for whisper output (whisper outputs audio + metadata)
- debug!("Looking for abuffersink filter for audio output...");
- let abuffersink_filter = ffmpeg::filter::find("abuffersink")
- .ok_or_else(|| anyhow::anyhow!("abuffersink filter not found"))?;
- debug!("abuffersink filter found: {}", abuffersink_filter.name());
-
- debug!("Adding abuffersink filter...");
- match filter_graph.add(&abuffersink_filter, "sink", "") {
- Ok(_) => debug!("abuffersink filter added successfully"),
- Err(e) => {
- error!("Failed to add abuffersink filter: {}", e);
- return Err(anyhow::anyhow!("Failed to add abuffersink filter: {}", e));
- }
- }
-
- // Connect filters using the complete filter chain description
- debug!("Connecting filter graph with complete chain: src -> whisper -> sink");
-
- let filter_chain = format!(
- "[src]whisper=model={}:queue=30:format=json[sink]",
- "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin",
- //"/Users/malte/repos/lleap/whisper-models/ggml-silero-v5.1.2.bin"
- );
- debug!("Using filter chain: {}", filter_chain);
-
- if let Err(e) = filter_graph
- .output("src", 0)
- .and_then(|o| o.input("sink", 0))
- .and_then(|i| i.parse(&filter_chain))
- {
- error!("Failed to connect filter graph: {}", e);
- return Err(anyhow::anyhow!("Failed to connect filter graph: {}", e));
- }
- debug!("Filter graph connected successfully");
-
- // Validate filter graph
- debug!("Validating filter graph...");
- match filter_graph.validate() {
- Ok(_) => {
- info!("Filter graph validated successfully");
- debug!("Filter graph dump:\n{}", filter_graph.dump());
- }
- Err(e) => {
- error!("Filter graph validation failed: {}", e);
- debug!(
- "Filter graph dump before validation failure:\n{}",
- filter_graph.dump()
- );
- return Err(anyhow::anyhow!("Filter graph validation failed: {}", e));
- }
- }
-
- debug!("Getting final source and sink contexts...");
- let source_ctx = filter_graph
- .get("src")
- .ok_or_else(|| anyhow::anyhow!("Source context not found"))?;
- let sink_ctx = filter_graph
- .get("sink")
- .ok_or_else(|| anyhow::anyhow!("Sink context not found"))?;
- debug!("Final contexts retrieved successfully");
-
- Ok((filter_graph, source_ctx, sink_ctx))
- }
-
- fn check_whisper_output(
- &self,
- sink: &mut ffmpeg::filter::Context,
- sender: &ComponentSender<Self>,
- ) -> Result<()> {
- debug!("Attempting to read audio frames from whisper filter output...");
-
- // The whisper filter outputs audio frames with subtitle data in "lavfi.whisper.text" metadata
- let mut frame = unsafe { ffmpeg::Frame::empty() };
- let mut output_count = 0;
-
- while sink.sink().frame(&mut frame).is_ok() {
- output_count += 1;
- debug!(
- "Received audio frame {} from whisper filter (pts: {:?})",
- output_count,
- frame.pts()
- );
-
- // Look specifically for lavfi.whisper.text metadata
- if let Some(whisper_text) = frame.metadata().get("lavfi.whisper.text") {
- info!("Found whisper transcription: {}", whisper_text);
-
- let start_time = if let Some(pts) = frame.pts() {
- // Convert PTS to nanoseconds based on whisper filter's time base (16kHz)
- gst::ClockTime::from_nseconds((pts as u64 * 1_000_000_000) / 16000)
- } else {
- gst::ClockTime::ZERO
- };
-
- // Log all available metadata keys to help debug
- let metadata_entries: Vec<(String, String)> = frame
- .metadata()
- .iter()
- .map(|(k, v)| (k.to_string(), v.to_string()))
- .collect();
- if !metadata_entries.is_empty() {
- let metadata_keys: Vec<String> =
- metadata_entries.iter().map(|(k, _)| k.clone()).collect();
- debug!("Frame metadata keys: {:?}", metadata_keys);
- }
-
- // Parse the whisper text (might be JSON format)
- self.parse_whisper_text(whisper_text, start_time, sender)?;
- }
- }
-
- if output_count > 0 {
- info!("Processed {} frames from whisper filter", output_count);
- } else {
- debug!("No frames available from whisper filter");
- }
-
- Ok(())
- }
-
- fn parse_whisper_text(
- &self,
- whisper_text: &str,
- base_time: gst::ClockTime,
- sender: &ComponentSender<Self>,
- ) -> Result<()> {
- debug!("Parsing whisper text: {}", whisper_text);
-
- // The whisper text might be in different formats depending on the filter configuration
- // For now, treat it as plain text and create a single cue
- let cue = SubtitleCue {
- start: base_time,
- end: base_time + gst::ClockTime::from_seconds(3), // Default 3 second duration
- text: whisper_text.to_string(),
- };
-
- let whisper_stream_index = 1000;
- if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) {
- track.cues.push(cue.clone());
- }
-
- sender
- .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue))
- .unwrap();
-
- Ok(())
- }
-
- fn parse_whisper_subtitle_data(
- &self,
- subtitle_data: &str,
- sender: &ComponentSender<Self>,
- ) -> Result<()> {
- // Parse SRT-format output from whisper
- info!(
- "Parsing whisper subtitle data ({} characters)",
- subtitle_data.len()
- );
- debug!("Subtitle data content:\n{}", subtitle_data);
- let lines: Vec<&str> = subtitle_data.lines().collect();
- let mut i = 0;
-
- while i < lines.len() {
- // Skip subtitle number
- if lines[i].trim().parse::<i32>().is_ok() {
- i += 1;
- }
-
- // Parse timestamp line
- if i < lines.len() {
- if let Some((start, end)) = self.parse_srt_timestamp(lines[i]) {
- i += 1;
-
- // Collect text lines
- let mut text_lines = Vec::new();
- while i < lines.len() && !lines[i].trim().is_empty() {
- text_lines.push(lines[i].to_string());
- i += 1;
- }
-
- if !text_lines.is_empty() {
- let cue = SubtitleCue {
- start,
- end,
- text: text_lines.join("\n"),
- };
-
- let whisper_stream_index = 1000;
- if let Some(track) = TRACKS.write().get_mut(&whisper_stream_index) {
- track.cues.push(cue.clone());
- }
-
- sender
- .output(SubtitleExtractorOutput::NewCue(whisper_stream_index, cue))
- .unwrap();
- }
- }
- }
- i += 1;
- }
-
- Ok(())
- }
-
- fn parse_srt_timestamp(&self, line: &str) -> Option<(gst::ClockTime, gst::ClockTime)> {
- // Parse SRT timestamp format: "00:00:01,234 --> 00:00:05,678"
- let parts: Vec<&str> = line.split(" --> ").collect();
- if parts.len() != 2 {
- return None;
- }
-
- let start = self.parse_srt_time(parts[0])?;
- let end = self.parse_srt_time(parts[1])?;
-
- Some((start, end))
- }
-
- fn parse_srt_time(&self, time_str: &str) -> Option<gst::ClockTime> {
- // Parse SRT time format: "00:00:01,234"
- let parts: Vec<&str> = time_str.split(',').collect();
- if parts.len() != 2 {
- return None;
- }
-
- let time_part = parts[0];
- let millis: u32 = parts[1].parse().ok()?;
-
- let time_components: Vec<&str> = time_part.split(':').collect();
- if time_components.len() != 3 {
- return None;
- }
-
- let hours: u32 = time_components[0].parse().ok()?;
- let minutes: u32 = time_components[1].parse().ok()?;
- let seconds: u32 = time_components[2].parse().ok()?;
-
- let total_millis = hours * 3600000 + minutes * 60000 + seconds * 1000 + millis;
- let nanoseconds = total_millis as u64 * 1_000_000;
-
- Some(gst::ClockTime::from_nseconds(nanoseconds))
- }
-
- fn subtitle_to_cue(
- subtitle: &ffmpeg::Subtitle,
- packet: &ffmpeg::Packet,
- time_base: Rational,
- ) -> Option<SubtitleCue> {
- let time_to_clock_time = |time: i64| {
- let nseconds: i64 = (time * time_base.numerator() as i64 * 1_000_000_000)
- / time_base.denominator() as i64;
- gst::ClockTime::from_nseconds(nseconds as u64)
- };
-
- let text = subtitle
- .rects()
- .into_iter()
- .map(|rect| match rect {
- ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
- ffmpeg::subtitle::Rect::Ass(ass) => {
- Self::extract_dialogue_text(ass.get()).unwrap_or(String::new())
- }
- _ => String::new(),
- })
- .collect::<Vec<String>>()
- .join("\n— ");
-
- let start = time_to_clock_time(packet.pts()?);
- let end = time_to_clock_time(packet.pts()? + packet.duration());
-
- Some(SubtitleCue { start, end, text })
- }
-
- fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
- // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
- // we need the 9th field (Text), so split on comma but only take first 9 splits
- // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
- let text = dialogue_line.splitn(9, ',').last()?;
-
- // remove ASS override codes (formatting tags) like {\b1}, {\i1}, {\c&Hffffff&}, etc.
- let mut result = String::new();
- let mut in_tag = false;
- let mut char_iter = text.chars().peekable();
-
- while let Some(c) = char_iter.next() {
- if c == '{' && char_iter.peek() == Some(&'\\') {
- in_tag = true;
- } else if c == '}' {
- in_tag = false;
- } else if !in_tag {
- // process line breaks and hard spaces
- if c == '\\' {
- match char_iter.peek() {
- Some(&'N') => {
- char_iter.next();
- result.push('\n');
- }
- Some(&'n') | Some(&'h') => {
- char_iter.next();
- result.push(' ');
- }
- _ => result.push(c),
- }
- } else {
- result.push(c);
- }
- }
- }
-
- Some(result)
- }
-}