about summary refs log tree commit diff
path: root/src/subtitle_extraction
diff options
context:
space:
mode:
author Malte Voos <git@mal.tc> 2025-11-15 01:05:24 +0100
committer Malte Voos <git@mal.tc> 2025-11-15 02:14:19 +0100
commit 016b76acba13e86df59f818581aa61f7bbaffff8 (patch)
tree e88417289428a947d818d3eeaeafbd2df34b4efb /src/subtitle_extraction
parent a8457a25ccb9b1ef47f5ce9d7ac1a84c47600c9e (diff)
download lleap-016b76acba13e86df59f818581aa61f7bbaffff8.tar.gz
lleap-016b76acba13e86df59f818581aa61f7bbaffff8.zip
whisper transcription working
Diffstat (limited to 'src/subtitle_extraction')
-rw-r--r-- src/subtitle_extraction/embedded.rs 8
-rw-r--r-- src/subtitle_extraction/whisper.rs 100
2 files changed, 88 insertions, 20 deletions
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
index 5cdf813..0ba6178 100644
--- a/src/subtitle_extraction/embedded.rs
+++ b/src/subtitle_extraction/embedded.rs
@@ -53,9 +53,9 @@ fn parse_subtitle(
packet: &ffmpeg::Packet,
time_base: Rational,
) -> Option<SubtitleCue> {
- let time_to_clock_time = |time: i64| {
+ let pts_to_clock_time = |pts: i64| {
let nseconds: i64 =
- (time * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
+ (pts * time_base.numerator() as i64 * 1_000_000_000) / time_base.denominator() as i64;
gst::ClockTime::from_nseconds(nseconds as u64)
};
@@ -72,8 +72,8 @@ fn parse_subtitle(
.collect::<Vec<String>>()
.join("\n— ");
- let start = time_to_clock_time(packet.pts()?);
- let end = time_to_clock_time(packet.pts()? + packet.duration());
+ let start = pts_to_clock_time(packet.pts()?);
+ let end = pts_to_clock_time(packet.pts()? + packet.duration());
Some(SubtitleCue { start, end, text })
}
diff --git a/src/subtitle_extraction/whisper.rs b/src/subtitle_extraction/whisper.rs
index 5622d6f..ffa2e47 100644
--- a/src/subtitle_extraction/whisper.rs
+++ b/src/subtitle_extraction/whisper.rs
@@ -1,10 +1,22 @@
-use std::sync::mpsc;
+use std::{
+ io::{self, BufRead, BufReader},
+ net::{TcpListener, TcpStream},
+ sync::mpsc,
+};
use anyhow::Context;
-use ffmpeg::filter;
+use ffmpeg::{filter, frame};
+use serde::Deserialize;
use crate::{subtitle_extraction::*, tracks::StreamIndex};
+#[derive(Debug, Deserialize)]
+struct WhisperCue {
+ start: u64,
+ end: u64,
+ text: String,
+}
+
pub fn generate_whisper_subtitles(
// stream index to use when storing generated subtitles, this index
// already has to be in TRACKS when this function is called!
@@ -14,6 +26,13 @@ pub fn generate_whisper_subtitles(
packet_rx: mpsc::Receiver<ffmpeg::Packet>,
sender: ComponentSender<SubtitleExtractor>,
) -> anyhow::Result<()> {
+ // FFmpeg's whisper filter will send the generated subtitles to us as JSON
+ // objects over a TCP socket. This is the best solution I could find
+ // because we need to use one of the protocols in
+ // https://ffmpeg.org/ffmpeg-protocols.html, and TCP is the only one on the
+ // list which is portable and supports non-blocking IO in Rust.
+ let tcp_listener = TcpListener::bind("127.0.0.1:0")?;
+
let mut decoder = context
.decoder()
.audio()
@@ -23,14 +42,17 @@ pub fn generate_whisper_subtitles(
let abuffer_args = format!(
"time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
- decoder.time_base(),
+ time_base,
decoder.rate(),
decoder.format().name(),
decoder.channel_layout().bits()
);
+
let whisper_args = format!(
- "model={}:queue={}:format=json",
- "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin", 30
+ "model={}:queue={}:destination=tcp\\\\://127.0.0.1\\\\:{}:format=json",
+ "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin",
+ 30,
+ tcp_listener.local_addr()?.port()
);
let filter_spec = format!("[src] whisper={} [sink]", whisper_args);
@@ -45,31 +67,77 @@ pub fn generate_whisper_subtitles(
let mut source_ctx = filter.get("src").unwrap();
let mut sink_ctx = filter.get("sink").unwrap();
+ let (tcp_stream, _) = tcp_listener.accept()?;
+ tcp_stream.set_nonblocking(true)?;
+
+ let mut transcript_reader = BufReader::new(tcp_stream);
+ let mut line_buf = String::new();
+
while let Ok(packet) = packet_rx.recv() {
- handle_packet(&mut decoder, source_ctx.source(), sink_ctx.sink(), packet)
- .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
+ handle_packet(
+ stream_ix,
+ &sender,
+ &mut decoder,
+ source_ctx.source(),
+ sink_ctx.sink(),
+ &mut transcript_reader,
+ &mut line_buf,
+ packet,
+ )
+ .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
}
Ok(())
}
+// TODO: can we do this without passing all the arguments? this is kinda ugly
fn handle_packet(
+ stream_ix: StreamIndex,
+ sender: &ComponentSender<SubtitleExtractor>,
decoder: &mut ffmpeg::decoder::Audio,
mut source: filter::Source,
mut sink: filter::Sink,
+ transcript_reader: &mut BufReader<TcpStream>,
+ line_buf: &mut String,
packet: ffmpeg::Packet,
) -> anyhow::Result<()> {
- let mut in_frame = unsafe { ffmpeg::Frame::empty() };
decoder.send_packet(&packet)?;
- decoder.receive_frame(&mut in_frame)?;
- source.add(&in_frame)?;
- let mut out_frame = unsafe { ffmpeg::Frame::empty() };
- sink.frame(&mut out_frame)?;
-
- if let Some(text) = out_frame.metadata().get("lavfi.whisper.text") {
- println!("{}", text);
+ let mut decoded = frame::Audio::empty();
+ while decoder.receive_frame(&mut decoded).is_ok() {
+ source.add(&decoded)?;
}
- Ok(())
+ let mut out_frame = frame::Audio::empty();
+ while sink.frame(&mut out_frame).is_ok() {}
+
+ line_buf.clear();
+ match transcript_reader.read_line(line_buf) {
+ Ok(_) => {
+ let whisper_cue: WhisperCue = serde_json::from_str(&line_buf)?;
+
+ let cue = SubtitleCue {
+ start: gst::ClockTime::from_mseconds(whisper_cue.start),
+ end: gst::ClockTime::from_mseconds(whisper_cue.end),
+ text: whisper_cue.text,
+ };
+
+ // TODO deduplicate this vs. the code in embedded.rs
+ SUBTITLE_TRACKS
+ .write()
+ .get_mut(&stream_ix)
+ .unwrap()
+ .cues
+ .push(cue.clone());
+ sender
+ .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
+ .unwrap();
+
+ Ok(())
+ }
+ Err(e) => match e.kind() {
+ io::ErrorKind::WouldBlock => Ok(()),
+ _ => Err(e)?,
+ },
+ }
}