transcribe-voice-tracks use whisper and search for lines

2025-04-04 11:46:58 -05:00
parent 15962efe57
commit 9c29b97c86
3 changed files with 106 additions and 78 deletions


@@ -4,7 +4,7 @@
 source_and_alias() {
     source env/bin/activate
     SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-    alias transcribe-voice-track="python ${SCRIPT_DIR}/transcribe-voice-track.py"
+    alias transcribe-voice-tracks="python ${SCRIPT_DIR}/transcribe-voice-tracks.py"
     alias amplify-voice-tracks="python ${SCRIPT_DIR}/amplify-voice-tracks.py"
     alias combine-voice-tracks="python ${SCRIPT_DIR}/combine-voice-tracks.py"
     alias cut-voice-track="python ${SCRIPT_DIR}/cut-voice-track.py"

transcribe-voice-track.py (deleted)

@@ -1,77 +0,0 @@
#! /usr/bin/env python
# pip install -r requirements.txt
usage = 'python transcribe-voice-track.py <wav filenames...> '
# https://towardsdatascience.com/speech-recognition-with-timestamps-934ede4234b2
# If you don't get results, try re-exporting as Signed 16-bit PCM
import util
import os
import requests
from zipfile import ZipFile
from imports import *  # evidently provides wave, wavfile, and json
from vosk import Model, KaldiRecognizer
model_name = "vosk-model-small-en-us-0.15"
# model_name = "vosk-model-en-us-0.22"  # bigger model; so far, the small one works pretty well
model_path = f"models/{model_name}"
model_zip_path = f"{model_path}.zip"
model_url = f"http://alphacephei.com/vosk/{model_zip_path}"
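# The catalog of downloadable Vosk models is at https://alphacephei.com/vosk/models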
# Download the model if it doesn't exist
os.makedirs('models', exist_ok=True)
if not os.path.exists(model_path):
    with open(model_zip_path, 'wb') as f:
        response = requests.get(model_url)
        f.write(response.content)
    with ZipFile(model_zip_path, "r") as zip_file:
        zip_file.extractall('models')
model = Model(model_path)
audio_filenames = util.args(1, usage)
for audio_filename in audio_filenames:
    wf = wave.open(audio_filename, "rb")
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    frames = 4000  # frames per read (~0.09 s of 44.1 kHz audio); also embedded in the output filename
    # If the input is stereo, re-export just the left channel as a mono file
    # (the recognizer wants mono; this also sidesteps incompatible sample widths)
    if wf.getnchannels() == 2:
        wf.close()
        mono_filename = '.'.join(audio_filename.split('.')[:-1]) + '_mono.wav'
        fs, data = wavfile.read(audio_filename)
        wavfile.write(mono_filename, fs, data[:, 0])
        wf = wave.open(mono_filename, 'rb')
    wf.rewind()
    results = []
    def filter_result(result):
        result = json.loads(result)
        if len(result) != 1:
            results.append(result)
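    # rec.Result() and rec.FinalResult() return JSON like
    #   {"result": [{"word": ..., "start": ..., "end": ..., "conf": ...}], "text": "..."}
    # while silence yields just {"text": ""}, a dict of length 1, which is what
    # filter_result drops.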
    while True:
        data = wf.readframes(frames)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            filter_result(rec.Result())
    filter_result(rec.FinalResult())
    with open(f"{audio_filename}_{frames}.json", "w") as f:
        lines = {}
        for sentence in results:
            words = sentence['result']
            text = sentence['text']
            # Account for duplicate sentences:
            if text not in lines:
                lines[text] = []
            lines[text].append({'start': words[0]['start'], 'end': words[-1]['end']})
            print(f'{text}: {words[0]["start"]} {words[-1]["end"]}')
        json.dump(lines, f)
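
For reference, the deleted script wrote one <input>_<frames>.json file per take, mapping each recognized sentence to every span where it occurred. A minimal sketch of reading one back (the filename below is hypothetical):

import json

with open('take1.wav_4000.json') as f:
    spans_by_line = json.load(f)
for text, spans in spans_by_line.items():
    for span in spans:
        print(f"{text}: {span['start']:.2f}s to {span['end']:.2f}s")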

transcribe-voice-tracks.py (new file)

@@ -0,0 +1,105 @@
#! /usr/bin/env python
# pip install -r requirements.txt
usage = 'python transcribe-voice-tracks.py <fountain filename> <character> <wav filenames...>'
import util
from imports import *  # evidently provides json and other shared imports
import whisper
model = whisper.load_model("turbo")
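# "turbo" resolves to the large-v3-turbo checkpoint in recent openai-whisper
# releases and is downloaded on first use; smaller models such as "base" trade
# accuracy for speed.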
import re
punc_list = ['.', '!', '?', ';', '--']
re_punc_list = [re.escape(punc) for punc in punc_list]
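# e.g. re.split('|'.join(re_punc_list), 'No. Not yet!') -> ['No', ' Not yet', '']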
fountain_file = util.arg(1, usage)
character = util.arg(2, usage)
audio_filenames = util.args(3, usage)
lines = ""
with open(fountain_file, 'r') as f:
lines = f.readlines()
# Put a list of dialog lines wanted into a FuzzyMap
fmap = util.FuzzyMap()
all_partials = {}
prev_found = {}
idx = 0
while idx < len(lines) - 1:
line = lines[idx].strip()
idx += 1
# If it ends with punctuation, it's probably a screen line!
for punc in punc_list:
if line.endswith(punc):
continue
if len(line) == 0:
continue
# If it has lower-case letters, it's not a speech name
tokens = line.split(" ")
all_upper = True
for token in tokens:
if token.upper() != token:
all_upper = False
break
# It's probably a speech name
if all_upper:
name = line
if '(' in name:
name = name[:name.find('(')].strip()
line = lines[idx].strip()
idx += 1
# Skip wryly lines
if line.startswith('('):
line = line[line.find(')') + 1:].strip()
if len(line) == 0:
line = lines[idx].strip()
idx += 1
if character.upper() != name:
continue
# Put the line in the map
fmap.put(line, [])
# TODO this is experimental:
# Put each part of the line in the map, so we can try to catch partial parts!
partials = re.split('|'.join(re_punc_list), line)
if len(partials) > 1:
for part in partials:
part = part.strip()
fmap.put(part, [])
all_partials[part] = True
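
# For a fountain snippet like (example text made up):
#
#     ALICE (CONT'D)
#     (beat) I saw it. I swear!
#
# the loop above stores the full line "I saw it. I swear!" plus the partial
# clauses "I saw it" and "I swear".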
map = fmap.map
print(map)
for audio_filename in audio_filenames:
    result = model.transcribe(audio_filename)
    print(result['segments'])
    for segment in result['segments']:
        match = fmap.best_match(segment['text'])
        if match in all_partials:
            print(f'PARTIAL FOUND: {match}')
        match_list = fmap.map[match]
        if match_list is not None:
            match_list.append({'start': segment['start'], 'end': segment['end']})
    # Dump whatever was found in this take, then reset for the next one
    to_dump = {}
    for key in list(map.keys()):
        if len(map[key]) != 0:
            to_dump[key] = map[key]
            prev_found[key] = True
            map[key] = []
    with open(f"{audio_filename}.{character}.json", "w") as f:
        json.dump(to_dump, f)

# Report full lines that never matched in any take
for key in map.keys():
    if key not in all_partials and len(map[key]) == 0 and key not in prev_found:
        print(f'NOT FOUND: {key}')
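
util.FuzzyMap is a project helper that is not part of this diff; from the calls above it must expose put(key, value), best_match(text), and a dict at .map. A minimal sketch of such a class built on difflib, purely as an illustration of the interface (the real implementation may differ):

import difflib

class FuzzyMap:
    # Hypothetical stand-in for util.FuzzyMap; illustration only.
    def __init__(self):
        self.map = {}

    def put(self, key, value):
        self.map[key] = value

    def best_match(self, text, cutoff=0.6):
        # Return the stored key most similar to text, or None if nothing
        # clears the similarity cutoff.
        matches = difflib.get_close_matches(text.strip(), list(self.map),
                                            n=1, cutoff=cutoff)
        return matches[0] if matches else None

Note that the transcription loop above indexes fmap.map[match] directly and then checks the result against None, which suggests the real helper tolerates misses; with this sketch you would use fmap.map.get(match) to get the same behavior.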