summary refs log tree commit diff
path: root/src/subtitle_extraction/embedded.rs
blob: 5cdf8132cd80d2b703187889a02d3f61ae3edee0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use std::sync::mpsc;

use anyhow::Context;

use crate::subtitle_extraction::*;

/// Decodes the subtitle packets of one embedded stream and publishes the resulting cues.
///
/// Blocks on `packet_rx` until the channel is closed. Every packet that decodes and parses
/// successfully is appended to the `SUBTITLE_TRACKS` entry for `stream_ix` and then announced
/// through `sender` as `SubtitleExtractorOutput::NewCue`. Packets that fail to decode or parse
/// are logged and skipped.
///
/// # Errors
///
/// Returns an error if no subtitle decoder can be created from `context`.
///
/// # Panics
///
/// Panics if `SUBTITLE_TRACKS` has no entry for `stream_ix`.
pub fn extract_embedded_subtitles(
    // stream index to use when storing extracted subtitles, this index already
    // has to be in SUBTITLE_TRACKS when this function is called!
    stream_ix: StreamIndex,
    context: ffmpeg::codec::Context,
    time_base: ffmpeg::Rational,
    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
    sender: ComponentSender<SubtitleExtractor>,
) -> anyhow::Result<()> {
    let mut decoder = context
        .decoder()
        .subtitle()
        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;

    while let Ok(packet) = packet_rx.recv() {
        let mut subtitle = ffmpeg::Subtitle::new();
        match decoder.decode(&packet, &mut subtitle) {
            Ok(true) => {
                if let Some(cue) = parse_subtitle(&subtitle, &packet, time_base) {
                    // The write guard is a temporary of this statement, so the lock is
                    // released again before the cue is sent out below.
                    SUBTITLE_TRACKS
                        .write()
                        .get_mut(&stream_ix)
                        .expect("stream index must be registered in SUBTITLE_TRACKS before extraction starts")
                        .cues
                        .push(cue.clone());

                    // A failed send means nobody is listening for cues any more; that is
                    // not worth panicking over, so stop extracting quietly instead.
                    if sender
                        .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
                        .is_err()
                    {
                        log::warn!(
                            "subtitle cue receiver is gone, stopping extraction for stream {}",
                            stream_ix
                        );
                        return Ok(());
                    }
                } else {
                    log::error!("error parsing subtitle at pts {:?}", packet.pts())
                }
            }
            Ok(false) => {
                log::debug!("got empty (?) subtitle, not sure if this should ever happen");
            }
            Err(e) => {
                log::error!("error decoding subtitle: {:?}", e)
            }
        }
    }

    Ok(())
}

/// Builds a `SubtitleCue` from a decoded subtitle and the packet it was decoded from.
///
/// The cue text is assembled from all rects of `subtitle`: plain-text rects are taken verbatim,
/// ASS rects are reduced to their dialogue text, and every other rect kind contributes an empty
/// string. The per-rect strings are joined with `"\n— "`.
///
/// The start time is the packet's pts and the end time is pts plus the packet duration, both
/// converted from `time_base` units to nanoseconds.
///
/// Returns `None` if the packet has no pts, or if a timestamp cannot be represented as a clock
/// time (zero time-base denominator, negative result, or a value that does not fit).
fn parse_subtitle(
    subtitle: &ffmpeg::Subtitle,
    packet: &ffmpeg::Packet,
    time_base: Rational,
) -> Option<SubtitleCue> {
    let time_to_clock_time = |time: i64| -> Option<gst::ClockTime> {
        let denominator = time_base.denominator() as i128;
        if denominator == 0 {
            return None;
        }

        // Widen to i128 before multiplying: `time * numerator * 1e9` exceeds the i64 range
        // after only a few hours of stream time when the time base is in microseconds.
        let nseconds =
            i128::from(time) * time_base.numerator() as i128 * 1_000_000_000 / denominator;

        // Reject negative values instead of letting them wrap into huge bogus timestamps.
        // u64::MAX itself is excluded too; presumably it is reserved as the "no time"
        // sentinel on the GStreamer side (TODO confirm).
        if !(0..u64::MAX as i128).contains(&nseconds) {
            return None;
        }
        Some(gst::ClockTime::from_nseconds(nseconds as u64))
    };

    // Resolve the timing first so packets without usable timestamps bail out early.
    let pts = packet.pts()?;
    let start = time_to_clock_time(pts)?;
    let end = time_to_clock_time(pts.checked_add(packet.duration())?)?;

    let text = subtitle
        .rects()
        .into_iter()
        .map(|rect| match rect {
            ffmpeg::subtitle::Rect::Text(text) => text.get().to_string(),
            ffmpeg::subtitle::Rect::Ass(ass) => {
                extract_dialogue_text(ass.get()).unwrap_or_default()
            }
            _ => String::new(),
        })
        .collect::<Vec<String>>()
        .join("\n— ");

    Some(SubtitleCue { start, end, text })
}

/// Extracts the plain display text from an ASS dialogue line.
///
/// The expected layout is `ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text`.
/// From the `Text` field this function
/// - removes override blocks, i.e. everything from a `{` that is directly followed by `\`
///   up to and including the next `}` (e.g. `{\b1}`, `{\c&Hffffff&}`),
/// - turns `\N` into a newline and `\n` / `\h` into a single space,
/// - keeps every other character, including braces that do not belong to an override block.
///
/// Returns `None` if the line has fewer than nine comma-separated fields.
fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
    // The text is the 9th field and may itself contain commas, hence at most 9 splits.
    // `nth(8)` (unlike `last()`) yields `None` for short lines instead of handing back one of
    // the metadata fields as if it were the text.
    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
    let text = dialogue_line.splitn(9, ',').nth(8)?;

    let mut result = String::with_capacity(text.len());
    let mut in_tag = false;
    let mut char_iter = text.chars().peekable();

    while let Some(c) = char_iter.next() {
        if in_tag {
            // Inside an override block everything is discarded; `}` closes the block.
            if c == '}' {
                in_tag = false;
            }
            continue;
        }

        match c {
            // Only `{\...` opens an override block; a lone `{` is ordinary text.
            '{' if char_iter.peek() == Some(&'\\') => in_tag = true,
            // process line breaks and hard spaces
            '\\' => match char_iter.peek() {
                Some(&'N') => {
                    char_iter.next();
                    result.push('\n');
                }
                Some(&'n') | Some(&'h') => {
                    char_iter.next();
                    result.push(' ');
                }
                _ => result.push(c),
            },
            // Everything else is literal text. This includes a `}` outside of an override
            // block, so that `{note}` is not mangled into `{note`.
            _ => result.push(c),
        }
    }

    Some(result)
}