diff options
| author | Malte Voos <git@mal.tc> | 2025-11-15 01:05:24 +0100 |
|---|---|---|
| committer | Malte Voos <git@mal.tc> | 2025-11-15 02:14:19 +0100 |
| commit | 016b76acba13e86df59f818581aa61f7bbaffff8 (patch) | |
| tree | e88417289428a947d818d3eeaeafbd2df34b4efb /src | |
| parent | a8457a25ccb9b1ef47f5ce9d7ac1a84c47600c9e (diff) | |
| download | lleap-016b76acba13e86df59f818581aa61f7bbaffff8.tar.gz lleap-016b76acba13e86df59f818581aa61f7bbaffff8.zip | |
whisper transcription working
Diffstat (limited to 'src')
| -rw-r--r-- | src/subtitle_extraction/embedded.rs | 8 | ||||
| -rw-r--r-- | src/subtitle_extraction/whisper.rs | 100 |
2 files changed, 88 insertions, 20 deletions
```diff
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
index 5cdf813..0ba6178 100644
--- a/src/subtitle_extraction/embedded.rs
+++ b/src/subtitle_extraction/embedded.rs
@@ -53,9 +53,9 @@ fn parse_subtitle(
     packet: &ffmpeg::Packet,
     time_base: Rational,
 ) -> Option<SubtitleCue> {
-    let time_to_clock_time = |time: i64| {
+    let pts_to_clock_time = |pts: i64| {
         let nseconds: i64 =
-            (time * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
+            (pts * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
         gst::ClockTime::from_nseconds(nseconds as u64)
     };
 
@@ -72,8 +72,8 @@ fn parse_subtitle(
         .collect::<Vec<String>>()
         .join("\n— ");
 
-    let start = time_to_clock_time(packet.pts()?);
-    let end = time_to_clock_time(packet.pts()? + packet.duration());
+    let start = pts_to_clock_time(packet.pts()?);
+    let end = pts_to_clock_time(packet.pts()? + packet.duration());
 
     Some(SubtitleCue { start, end, text })
 }
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
index 5622d6f..ffa2e47 100644
--- a/src/subtitle_extraction/whisper.rs
+++ b/src/subtitle_extraction/whisper.rs
@@ -1,10 +1,22 @@
-use std::sync::mpsc;
+use std::{
+    io::{self, BufRead, BufReader},
+    net::{TcpListener, TcpStream},
+    sync::mpsc,
+};
 
 use anyhow::Context;
-use ffmpeg::filter;
+use ffmpeg::{filter, frame};
+use serde::Deserialize;
 
 use crate::{subtitle_extraction::*, tracks::StreamIndex};
 
+#[derive(Debug, Deserialize)]
+struct WhisperCue {
+    start: u64,
+    end: u64,
+    text: String,
+}
+
 pub fn generate_whisper_subtitles(
     // stream index to use when storing generated subtitles, this index
     // already has to be in TRACKS when this function is called!
@@ -14,6 +26,13 @@ pub fn generate_whisper_subtitles(
     packet_rx: mpsc::Receiver<ffmpeg::Packet>,
     sender: ComponentSender<SubtitleExtractor>,
 ) -> anyhow::Result<()> {
+    // FFmpeg's whisper filter will send the generated subtitles to us as JSON
+    // objects over a TCP socket. This is the best solution I could find
+    // because we need to use one of the protocols in
+    // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the
+    // list which is portable and supports non-blocking IO in Rust.
+    let tcp_listener = TcpListener::bind("127.0.0.1:0")?;
+
     let mut decoder = context
         .decoder()
         .audio()
@@ -23,14 +42,17 @@ pub fn generate_whisper_subtitles(
 
     let abuffer_args = format!(
         "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
-        decoder.time_base(),
+        time_base,
         decoder.rate(),
         decoder.format().name(),
         decoder.channel_layout().bits()
     );
+
     let whisper_args = format!(
-        "model={}:queue={}:format=json",
-        "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin", 30
+        "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
+        "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin",
+        30,
+        tcp_listener.local_addr()?.port()
     );
     let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
 
@@ -45,31 +67,77 @@ pub fn generate_whisper_subtitles(
     let mut source_ctx = filter.get("src").unwrap();
     let mut sink_ctx = filter.get("sink").unwrap();
 
+    let (tcp_stream, _) = tcp_listener.accept()?;
+    tcp_stream.set_nonblocking(true)?;
+
+    let mut transcript_reader = BufReader::new(tcp_stream);
+    let mut line_buf = String::new();
+
     while let Ok(packet) = packet_rx.recv() {
-        handle_packet(&mut decoder, source_ctx.source(), sink_ctx.sink(), packet)
-            .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
+        handle_packet(
+            stream_ix,
+            &sender,
+            &mut decoder,
+            source_ctx.source(),
+            sink_ctx.sink(),
+            &mut transcript_reader,
+            &mut line_buf,
+            packet,
+        )
+        .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
     }
 
     Ok(())
 }
 
+// TODO: can we do this without passing all the arguments? this is kinda ugly
 fn handle_packet(
+    stream_ix: StreamIndex,
+    sender: &ComponentSender<SubtitleExtractor>,
     decoder: &mut ffmpeg::decoder::Audio,
     mut source: filter::Source,
     mut sink: filter::Sink,
+    transcript_reader: &mut BufReader<TcpStream>,
+    line_buf: &mut String,
     packet: ffmpeg::Packet,
 ) -> anyhow::Result<()> {
-    let mut in_frame = unsafe { ffmpeg::Frame::empty() };
     decoder.send_packet(&packet)?;
-    decoder.receive_frame(&mut in_frame)?;
-    source.add(&in_frame)?;
 
-    let mut out_frame = unsafe { ffmpeg::Frame::empty() };
-    sink.frame(&mut out_frame)?;
-
-    if let Some(text) = out_frame.metadata().get("lavfi.whisper.text") {
-        println!("{}", text);
+    let mut decoded = frame::Audio::empty();
+    while decoder.receive_frame(&mut decoded).is_ok() {
+        source.add(&decoded)?;
     }
 
-    Ok(())
+    let mut out_frame = frame::Audio::empty();
+    while sink.frame(&mut out_frame).is_ok() {}
+
+    line_buf.clear();
+    match transcript_reader.read_line(line_buf) {
+        Ok(_) => {
+            let whisper_cue: WhisperCue = serde_json::from_str(&line_buf)?;
+
+            let cue = SubtitleCue {
+                start: gst::ClockTime::from_mseconds(whisper_cue.start),
+                end: gst::ClockTime::from_mseconds(whisper_cue.end),
+                text: whisper_cue.text,
+            };
+
+            // TODO deduplicate this vs. the code in embedded.rs
+            SUBTITLE_TRACKS
+                .write()
+                .get_mut(&stream_ix)
+                .unwrap()
+                .cues
+                .push(cue.clone());
+            sender
+                .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
+                .unwrap();
+
+            Ok(())
+        }
+        Err(e) => match e.kind() {
+            io::ErrorKind::WouldBlock => Ok(()),
+            _ => Err(e)?,
+        },
+    }
 }
```