transcribe-voice-tracks use whisper and search for lines

2025-04-04 11:46:58 -05:00
parent 15962efe57
commit 9c29b97c86
3 changed files with 106 additions and 78 deletions


@@ -4,7 +4,7 @@
 source_and_alias() {
     source env/bin/activate
     SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-    alias transcribe-voice-track="python ${SCRIPT_DIR}/transcribe-voice-track.py"
+    alias transcribe-voice-tracks="python ${SCRIPT_DIR}/transcribe-voice-tracks.py"
     alias amplify-voice-tracks="python ${SCRIPT_DIR}/amplify-voice-tracks.py"
     alias combine-voice-tracks="python ${SCRIPT_DIR}/combine-voice-tracks.py"
     alias cut-voice-track="python ${SCRIPT_DIR}/cut-voice-track.py"

transcribe-voice-track.py (deleted)

@@ -1,77 +0,0 @@
#! /usr/bin/env python
# pip install -r requirements.txt
usage = 'python transcribe-voice-track.py <wav filenames...> '
# https://towardsdatascience.com/speech-recognition-with-timestamps-934ede4234b2
# If you don't get results, try re-exporting as Signed 16-bit PCM
import util
import os
import requests
from zipfile import ZipFile
from imports import *  # evidently provides wave, wavfile, and json
from vosk import Model, KaldiRecognizer
model_name = "vosk-model-small-en-us-0.15"
# model_name = "vosk-model-en-us-0.22"  # bigger model; so far, the small one works pretty well
model_path = f"models/{model_name}"
model_zip_path = f"{model_path}.zip"
model_url = f"http://alphacephei.com/vosk/{model_zip_path}"
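# The catalog of downloadable Vosk models is at https://alphacephei.com/vosk/models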
# Download the model if it doesn't exist
os.makedirs('models', exist_ok=True)
if not os.path.exists(model_path):
    with open(model_zip_path, 'wb') as f:
        response = requests.get(model_url)
        f.write(response.content)
    with ZipFile(model_zip_path, "r") as zip_file:
        zip_file.extractall('models')
model = Model(model_path)
audio_filenames = util.args(1, usage)
for audio_filename in audio_filenames:
    wf = wave.open(audio_filename, "rb")
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    frames = 4000  # frames per read (~0.09 s of 44.1 kHz audio); also embedded in the output filename
    # If the input is stereo, re-export just the left channel as a mono file
    # (the recognizer wants mono; this also sidesteps incompatible sample widths)
    if wf.getnchannels() == 2:
        wf.close()
        mono_filename = '.'.join(audio_filename.split('.')[:-1]) + '_mono.wav'
        fs, data = wavfile.read(audio_filename)
        wavfile.write(mono_filename, fs, data[:, 0])
        wf = wave.open(mono_filename, 'rb')
    wf.rewind()
    results = []
    def filter_result(result):
        result = json.loads(result)
        if len(result) != 1:
            results.append(result)
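    # rec.Result() and rec.FinalResult() return JSON like
    #   {"result": [{"word": ..., "start": ..., "end": ..., "conf": ...}], "text": "..."}
    # while silence yields just {"text": ""}, a dict of length 1, which is what
    # filter_result drops.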
    while True:
        data = wf.readframes(frames)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            filter_result(rec.Result())
    filter_result(rec.FinalResult())
    with open(f"{audio_filename}_{frames}.json", "w") as f:
        lines = {}
        for sentence in results:
            words = sentence['result']
            text = sentence['text']
            # Account for duplicate sentences:
            if text not in lines:
                lines[text] = []
            lines[text].append({'start': words[0]['start'], 'end': words[-1]['end']})
            print(f'{text}: {words[0]["start"]} {words[-1]["end"]}')
        json.dump(lines, f)
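
For reference, the deleted script wrote one <input>_<frames>.json file per take, mapping each recognized sentence to every span where it occurred. A minimal sketch of reading one back (the filename below is hypothetical):

import json

with open('take1.wav_4000.json') as f:
    spans_by_line = json.load(f)
for text, spans in spans_by_line.items():
    for span in spans:
        print(f"{text}: {span['start']:.2f}s to {span['end']:.2f}s")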

transcribe-voice-tracks.py (new file)

@@ -0,0 +1,105 @@
#! /usr/bin/env python
# pip install -r requirements.txt
usage = 'python transcribe-voice-tracks.py <fountain filename> <character> <wav filenames...>'
import util
from imports import *  # evidently provides json and other shared imports
import whisper
model = whisper.load_model("turbo")
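# "turbo" resolves to the large-v3-turbo checkpoint in recent openai-whisper
# releases and is downloaded on first use; smaller models such as "base" trade
# accuracy for speed.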
import re
punc_list = ['.', '!', '?', ';', '--']
re_punc_list = [re.escape(punc) for punc in punc_list]
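# e.g. re.split('|'.join(re_punc_list), 'No. Not yet!') -> ['No', ' Not yet', '']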
fountain_file = util.arg(1, usage)
character = util.arg(2, usage)
audio_filenames = util.args(3, usage)
lines = ""
with open(fountain_file, 'r') as f:
lines = f.readlines()
# Put a list of dialog lines wanted into a FuzzyMap
fmap = util.FuzzyMap()
all_partials = {}
prev_found = {}
idx = 0
while idx < len(lines) - 1:
line = lines[idx].strip()
idx += 1
# If it ends with punctuation, it's probably a screen line!
for punc in punc_list:
if line.endswith(punc):
continue
if len(line) == 0:
continue
# If it has lower-case letters, it's not a speech name
tokens = line.split(" ")
all_upper = True
for token in tokens:
if token.upper() != token:
all_upper = False
break
# It's probably a speech name
if all_upper:
name = line
if '(' in name:
name = name[:name.find('(')].strip()
line = lines[idx].strip()
idx += 1
# Skip wryly lines
if line.startswith('('):
line = line[line.find(')') + 1:].strip()
if len(line) == 0:
line = lines[idx].strip()
idx += 1
if character.upper() != name:
continue
# Put the line in the map
fmap.put(line, [])
# TODO this is experimental:
# Put each part of the line in the map, so we can try to catch partial parts!
partials = re.split('|'.join(re_punc_list), line)
if len(partials) > 1:
for part in partials:
part = part.strip()
fmap.put(part, [])
all_partials[part] = True
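
# For a fountain snippet like (example text made up):
#
#     ALICE (CONT'D)
#     (beat) I saw it. I swear!
#
# the loop above stores the full line "I saw it. I swear!" plus the partial
# clauses "I saw it" and "I swear".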
map = fmap.map
print(map)
for audio_filename in audio_filenames:
    result = model.transcribe(audio_filename)
    print(result['segments'])
    for segment in result['segments']:
        match = fmap.best_match(segment['text'])
        if match in all_partials:
            print(f'PARTIAL FOUND: {match}')
        match_list = fmap.map[match]
        if match_list is not None:
            match_list.append({'start': segment['start'], 'end': segment['end']})
    # Dump whatever was found in this take, then reset for the next one
    to_dump = {}
    for key in list(map.keys()):
        if len(map[key]) != 0:
            to_dump[key] = map[key]
            prev_found[key] = True
            map[key] = []
    with open(f"{audio_filename}.{character}.json", "w") as f:
        json.dump(to_dump, f)

# Report full lines that never matched in any take
for key in map.keys():
    if key not in all_partials and len(map[key]) == 0 and key not in prev_found:
        print(f'NOT FOUND: {key}')
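
util.FuzzyMap is a project helper that is not part of this diff; from the calls above it must expose put(key, value), best_match(text), and a dict at .map. A minimal sketch of such a class built on difflib, purely as an illustration of the interface (the real implementation may differ):

import difflib

class FuzzyMap:
    # Hypothetical stand-in for util.FuzzyMap; illustration only.
    def __init__(self):
        self.map = {}

    def put(self, key, value):
        self.map[key] = value

    def best_match(self, text, cutoff=0.6):
        # Return the stored key most similar to text, or None if nothing
        # clears the similarity cutoff.
        matches = difflib.get_close_matches(text.strip(), list(self.map),
                                            n=1, cutoff=cutoff)
        return matches[0] if matches else None

Note that the transcription loop above indexes fmap.map[match] directly and then checks the result against None, which suggests the real helper tolerates misses; with this sketch you would use fmap.map.get(match) to get the same behavior.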