summary refs log tree commit diff
path: root/src/subtitle_extraction/whisper.rs
blob: 5622d6f43ef54f8936e041d5ec20b8ebbdee6fe0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
use std::sync::mpsc;

use anyhow::Context;
use ffmpeg::filter;

use crate::{subtitle_extraction::*, tracks::StreamIndex};

pub fn generate_whisper_subtitles(
    // stream index to use when storing generated subtitles, this index
    // already has to be in TRACKS when this function is called!
    stream_ix: StreamIndex,
    context: ffmpeg::codec::Context,
    time_base: ffmpeg::Rational,
    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
    sender: ComponentSender<SubtitleExtractor>,
) -> anyhow::Result<()> {
    let mut decoder = context
        .decoder()
        .audio()
        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;

    let mut filter = filter::Graph::new();

    let abuffer_args = format!(
        "time_base={}:sample_rate={}:sample_fmt={}:channel_layout=0x{:x}",
        decoder.time_base(),
        decoder.rate(),
        decoder.format().name(),
        decoder.channel_layout().bits()
    );
    let whisper_args = format!(
        "model={}:queue={}:format=json",
        "/Users/malte/repos/lleap/whisper-models/ggml-large-v3.bin", 30
    );
    let filter_spec = format!("[src] whisper={} [sink]", whisper_args);

    filter.add(&filter::find("abuffer").unwrap(), "src", &abuffer_args)?;
    filter.add(&filter::find("abuffersink").unwrap(), "sink", "")?;
    filter
        .output("src", 0)?
        .input("sink", 0)?
        .parse(&filter_spec)?;
    filter.validate()?;

    let mut source_ctx = filter.get("src").unwrap();
    let mut sink_ctx = filter.get("sink").unwrap();

    while let Ok(packet) = packet_rx.recv() {
        handle_packet(&mut decoder, source_ctx.source(), sink_ctx.sink(), packet)
            .unwrap_or_else(|e| log::error!("error handling audio packet: {}", e))
    }

    Ok(())
}

fn handle_packet(
    decoder: &mut ffmpeg::decoder::Audio,
    mut source: filter::Source,
    mut sink: filter::Sink,
    packet: ffmpeg::Packet,
) -> anyhow::Result<()> {
    let mut in_frame = unsafe { ffmpeg::Frame::empty() };
    decoder.send_packet(&packet)?;
    decoder.receive_frame(&mut in_frame)?;
    source.add(&in_frame)?;

    let mut out_frame = unsafe { ffmpeg::Frame::empty() };
    sink.frame(&mut out_frame)?;

    if let Some(text) = out_frame.metadata().get("lavfi.whisper.text") {
        println!("{}", text);
    }

    Ok(())
}