diff --git a/scripts/env.sh b/scripts/env.sh
index f5553e3..85cb51c 100644
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -4,7 +4,7 @@ source_and_alias() {
     source env/bin/activate
     SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-    alias transcribe-voice-track="python ${SCRIPT_DIR}/transcribe-voice-track.py"
+    alias transcribe-voice-tracks="python ${SCRIPT_DIR}/transcribe-voice-tracks.py"
     alias amplify-voice-tracks="python ${SCRIPT_DIR}/amplify-voice-tracks.py"
     alias combine-voice-tracks="python ${SCRIPT_DIR}/combine-voice-tracks.py"
     alias cut-voice-track="python ${SCRIPT_DIR}/cut-voice-track.py"
diff --git a/scripts/transcribe-voice-track.py b/scripts/transcribe-voice-track.py
deleted file mode 100644
index 74811f6..0000000
--- a/scripts/transcribe-voice-track.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#! /usr/bin/env python
-# pip install -r requirements.txt
-usage = 'python transcribe-voice-track.py <audio files>'
-
-# https://towardsdatascience.com/speech-recognition-with-timestamps-934ede4234b2
-# If you don't get results, try re-exporting as Signed 16-bit PCM
-
-import util
-import os
-import requests
-from zipfile import ZipFile
-from imports import *
-
-from vosk import Model, KaldiRecognizer
-
-model_name = "vosk-model-small-en-us-0.15"
-# model_name = "vosk-model-en-us-0.22" # bigger model. So far, the small one works pretty good.
-
-model_path = f"models/{model_name}"
-model_zip_path = f"{model_path}.zip"
-model_url = f"http://alphacephei.com/vosk/{model_zip_path}"
-
-# Download the model if it doesn't exist
-os.makedirs('models', exist_ok=True)
-if not os.path.exists(model_path):
-    with open(model_zip_path, 'wb') as f:
-        response = requests.get(model_url)
-        f.write(response.content)
-    with ZipFile(model_zip_path, "r") as zip_file:
-        zip_file.extractall('models')
-
-model = Model(model_path)
-
-audio_filenames = util.args(1, usage)
-for audio_filename in audio_filenames:
-    wf = wave.open(audio_filename, "rb")
-
-    rec = KaldiRecognizer(model, wf.getframerate())
-    rec.SetWords(True)
-
-    frames = 4000
-
-    # Mix channels together if the input is stereo
-    # or the sample width is incompatible
-    if wf.getnchannels() == 2:
-        wf.close()
-        mono_filename = '.'.join(audio_filename.split('.')[:-1]) + '_mono.wav'
-        fs, data = wavfile.read(audio_filename)
-        wavfile.write(mono_filename, fs, data[:, 0])
-        wf = wave.open(mono_filename, 'rb')
-
-    wf.rewind()
-    results = []
-    def filter_result(result):
-        result = json.loads(result)
-        if len(result) != 1:
-            results.append(result)
-
-    while True:
-        data = wf.readframes(frames)
-        if len(data) == 0:
-            break
-        if rec.AcceptWaveform(data):
-            filter_result(rec.Result())
-    filter_result(rec.FinalResult())
-
-    with open(f"{audio_filename}_{frames}.json", "w") as f:
-        lines = {}
-        for sentence in results:
-            words = sentence['result']
-            text = sentence['text']
-            # Account for duplicate sentences:
-            if not text in lines:
-                lines[text] = []
-            lines[text].append({'start': words[0]['start'], 'end': words[-1]['end']})
-            print(f'{text}: {words[0]["start"]} {words[-1]["end"]}')
-        json.dump(lines, f)
diff --git a/scripts/transcribe-voice-tracks.py b/scripts/transcribe-voice-tracks.py
new file mode 100644
index 0000000..1f9d153
--- /dev/null
+++ b/scripts/transcribe-voice-tracks.py
@@ -0,0 +1,105 @@
+#! /usr/bin/env python
+# pip install -r requirements.txt
+usage = 'python transcribe-voice-tracks.py <fountain file> <character> <audio files>'
+
+import util
+from imports import *
+
+import whisper
+model = whisper.load_model("turbo")
+
+import re
+punc_list = ['.', '!', '?', ';', '--']
+re_punc_list = [re.escape(punc) for punc in punc_list]
+
+fountain_file = util.arg(1, usage)
+character = util.arg(2, usage)
+audio_filenames = util.args(3, usage)
+
+lines = ""
+with open(fountain_file, 'r') as f:
+    lines = f.readlines()
+
+# Put a list of dialog lines wanted into a FuzzyMap
+fmap = util.FuzzyMap()
+all_partials = {}
+prev_found = {}
+
+idx = 0
+while idx < len(lines) - 1:
+    line = lines[idx].strip()
+    idx += 1
+
+    # If it ends with punctuation, it's probably a screen (action or dialog) line,
+    # not a character cue, so skip it
+    if any(line.endswith(punc) for punc in punc_list):
+        continue
+
+    if len(line) == 0:
+        continue
+
+    # If it has lower-case letters, it's not a speech name
+    tokens = line.split(" ")
+    all_upper = True
+    for token in tokens:
+        if token.upper() != token:
+            all_upper = False
+            break
+
+    # It's probably a speech name
+    if all_upper:
+        name = line
+        if '(' in name:
+            name = name[:name.find('(')].strip()
+
+        line = lines[idx].strip()
+        idx += 1
+
+        # Skip wryly lines
+        if line.startswith('('):
+            line = line[line.find(')') + 1:].strip()
+            if len(line) == 0:
+                line = lines[idx].strip()
+                idx += 1
+
+        if character.upper() != name:
+            continue
+
+        # Put the line in the map
+        fmap.put(line, [])
+        # TODO this is experimental:
+        # Put each part of the line in the map, so we can try to catch partial matches!
+        partials = re.split('|'.join(re_punc_list), line)
+        if len(partials) > 1:
+            for part in partials:
+                part = part.strip()
+                fmap.put(part, [])
+                all_partials[part] = True
+
+map = fmap.map
+print(map)
+
+for audio_filename in audio_filenames:
+    result = model.transcribe(audio_filename)
+    print(result['segments'])
+    for segment in result['segments']:
+        match = fmap.best_match(segment['text'])
+        if match in all_partials:
+            print(f'PARTIAL FOUND: {match}')
+        match_list = fmap.map.get(match)
+        if match_list is not None:
+            match_list.append({'start': segment['start'], 'end': segment['end']})
+
+    to_dump = {}
+    for key in list(map.keys()):
+        if len(map[key]) != 0:
+            to_dump[key] = map[key]
+            prev_found[key] = True
+            map[key] = []
+
+    with open(f"{audio_filename}.{character}.json", "w") as f:
+        json.dump(to_dump, f)
+
+for key in map.keys():
+    if key not in all_partials and len(map[key]) == 0 and key not in prev_found:
+        print(f'NOT FOUND: {key}')
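
Reviewer note, not part of the diff: the new script depends on util.FuzzyMap, which is not shown here. The code assumes it offers put(key, value), best_match(text) that may return None when nothing matches well, and a plain dict attribute named map. A minimal sketch of that assumed interface, using difflib for the fuzzy matching (the real util.FuzzyMap may differ):

# Hypothetical stand-in for util.FuzzyMap, for illustration only.
import difflib

class FuzzyMap:
    def __init__(self, threshold=0.6):
        self.map = {}  # expected line -> list of {'start': ..., 'end': ...} timestamps
        self.threshold = threshold

    def put(self, key, value):
        self.map[key] = value

    def best_match(self, text):
        # Return the stored key most similar to `text`, or None if nothing clears the threshold.
        best_key, best_ratio = None, 0.0
        for key in self.map:
            ratio = difflib.SequenceMatcher(None, key.lower(), text.strip().lower()).ratio()
            if ratio > best_ratio:
                best_key, best_ratio = key, ratio
        return best_key if best_ratio >= self.threshold else None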