1 files changed, 118 insertions, 0 deletions
diff --git a/src/subtitle_extraction/embedded.rs b/src/subtitle_extraction/embedded.rs
new file mode 100644
index 0000000..5cdf813
--- /dev/null
+++ b/src/subtitle_extraction/embedded.rs
@@ -0,0 +1,118 @@
+use std::sync::mpsc;
+
+use anyhow::Context;
+
+use crate::subtitle_extraction::*;
+
+/// Decodes the subtitle packets arriving on `packet_rx` until that channel closes.
+///
+/// Every cue that parses is appended to the `SUBTITLE_TRACKS` entry for `stream_ix`
+/// (the entry must already exist when this is called) and sent to `sender` as `NewCue`;
+/// packets that fail to decode or parse are logged and skipped. Returns an error, instead
+/// of panicking, if the decoder cannot be created, the track entry is missing, or
+/// `sender` fails to deliver the cue.
+pub fn extract_embedded_subtitles(
+    stream_ix: StreamIndex,
+    context: ffmpeg::codec::Context,
+    time_base: ffmpeg::Rational,
+    packet_rx: mpsc::Receiver<ffmpeg::Packet>,
+    sender: ComponentSender<SubtitleExtractor>,
+) -> anyhow::Result<()> {
+    let mut decoder = context
+        .decoder()
+        .subtitle()
+        .with_context(|| format!("error creating subtitle decoder for stream {}", stream_ix))?;
+
+    while let Ok(packet) = packet_rx.recv() {
+        let mut subtitle = ffmpeg::Subtitle::new();
+        match decoder.decode(&packet, &mut subtitle) {
+            Ok(true) => match parse_subtitle(&subtitle, &packet, time_base) {
+                Some(cue) => {
+                    SUBTITLE_TRACKS
+                        .write()
+                        .get_mut(&stream_ix)
+                        .with_context(|| format!("no subtitle track for stream {}", stream_ix))?
+                        .cues
+                        .push(cue.clone());
+                    sender
+                        .output(SubtitleExtractorOutput::NewCue(stream_ix, cue))
+                        .map_err(|_| anyhow::anyhow!("could not deliver subtitle cue"))?;
+                }
+                None => log::error!("error parsing subtitle at pts {:?}", packet.pts()),
+            },
+            Ok(false) => log::debug!("got empty (?) subtitle, not sure if this should ever happen"),
+            Err(e) => log::error!("error decoding subtitle: {:?}", e),
+        }
+    }
+
+    Ok(())
+}
+
+/// Builds a cue from a decoded `subtitle`: the non-empty texts of its text/ASS rects joined
+/// with "\n— ", timed by the pts and duration of `packet`. Returns `None` if the packet has
+/// no pts, `time_base` has a zero denominator, or a time does not fit `u64` nanoseconds.
+fn parse_subtitle(
+    subtitle: &ffmpeg::Subtitle,
+    packet: &ffmpeg::Packet,
+    time_base: Rational,
+) -> Option<SubtitleCue> {
+    let (num, den) = (i128::from(time_base.numerator()), i128::from(time_base.denominator()));
+    let time_to_clock_time = |time: i64| -> Option<gst::ClockTime> {
+        // i128 keeps `time * 1e9` from overflowing; negative times clamp to 0, not wrap
+        let nseconds = (i128::from(time) * num * 1_000_000_000).checked_div(den)?.max(0);
+        u64::try_from(nseconds).ok().map(gst::ClockTime::from_nseconds)
+    };
+
+    let text = subtitle
+        .rects()
+        .filter_map(|rect| match rect {
+            ffmpeg::subtitle::Rect::Text(text) => Some(text.get().to_string()),
+            ffmpeg::subtitle::Rect::Ass(ass) => extract_dialogue_text(ass.get()),
+            _ => None,
+        })
+        .filter(|line| !line.is_empty())
+        .collect::<Vec<String>>()
+        .join("\n— ");
+    let start = time_to_clock_time(packet.pts()?)?;
+    let end = time_to_clock_time(packet.pts()?.saturating_add(packet.duration()))?;
+    Some(SubtitleCue { start, end, text })
+}
+
+/// Extracts the displayable text from an ASS dialogue event `dialogue_line`.
+///
+/// Returns `None` when the line has fewer than the nine comma-separated fields of
+/// the format below. In the text field, `{...}` blocks (override tags such as
+/// `{\b1}` as well as inline comments) are removed, `\N` becomes a line break, and
+/// `\n` / `\h` become a space. A `{` that is never closed and a `}` that was never
+/// opened are kept as literal text.
+fn extract_dialogue_text(dialogue_line: &str) -> Option<String> {
+    // ASS dialogue format: ReadOrder,Layer,Style,Name,MarginL,MarginR,MarginV,Effect,Text
+    // we need the 9th field (Text); only the first 8 commas separate fields, the rest is text
+    // see also https://github.com/FFmpeg/FFmpeg/blob/a700f0f72d1f073e5adcfbb16f4633850b0ef51c/libavcodec/ass_split.c#L433
+    let text = dialogue_line.splitn(9, ',').nth(8)?;
+
+    let mut result = String::with_capacity(text.len());
+    let mut rest = text;
+    while let Some(c) = rest.chars().next() {
+        // drop a whole `{...}` block (override tags or an inline comment)
+        if c == '{' {
+            if let Some(close) = rest.find('}') {
+                rest = &rest[close + 1..];
+                continue;
+            }
+        }
+        // process line breaks and hard spaces
+        if let Some(tail) = rest.strip_prefix("\\N") {
+            result.push('\n');
+            rest = tail;
+        } else if let Some(tail) = rest.strip_prefix("\\n").or_else(|| rest.strip_prefix("\\h")) {
+            result.push(' ');
+            rest = tail;
+        } else {
+            result.push(c);
+            rest = &rest[c.len_utf8()..];
+        }
+    }
+
+    Some(result)
+}