transcribe-voice-tracks use whisper and search for lines
@@ -4,7 +4,7 @@
 source_and_alias() {
     source env/bin/activate
     SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-    alias transcribe-voice-track="python ${SCRIPT_DIR}/transcribe-voice-track.py"
+    alias transcribe-voice-tracks="python ${SCRIPT_DIR}/transcribe-voice-tracks.py"
     alias amplify-voice-tracks="python ${SCRIPT_DIR}/amplify-voice-tracks.py"
     alias combine-voice-tracks="python ${SCRIPT_DIR}/combine-voice-tracks.py"
     alias cut-voice-track="python ${SCRIPT_DIR}/cut-voice-track.py"
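With the hunk above applied, an interactive session might look like the following. The alias file's name and all the filenames here are illustrative stand-ins, not values taken from the repo:

    $ source aliases.sh     # hypothetical name for the file this hunk patches
    $ source_and_alias
    $ transcribe-voice-tracks screenplay.fountain ALICE take1.wav take2.wav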
scripts/transcribe-voice-track.py (deleted):
@@ -1,77 +0,0 @@
#! /usr/bin/env python

# pip install -r requirements.txt

usage = 'python transcribe-voice-track.py <wav filenames...>'

# https://towardsdatascience.com/speech-recognition-with-timestamps-934ede4234b2
# If you don't get results, try re-exporting as Signed 16-bit PCM

import util
import os
import requests
from zipfile import ZipFile
from imports import *  # presumably supplies wave, wavfile, and json used below

from vosk import Model, KaldiRecognizer

model_name = "vosk-model-small-en-us-0.15"
# model_name = "vosk-model-en-us-0.22"  # bigger model. So far, the small one works pretty well.

model_path = f"models/{model_name}"
model_zip_path = f"{model_path}.zip"
model_url = f"http://alphacephei.com/vosk/{model_zip_path}"

# Download the model if it doesn't exist
os.makedirs('models', exist_ok=True)
if not os.path.exists(model_path):
    with open(model_zip_path, 'wb') as f:
        response = requests.get(model_url)
        f.write(response.content)
    with ZipFile(model_zip_path, "r") as zip_file:
        zip_file.extractall('models')

model = Model(model_path)

audio_filenames = util.args(1, usage)
for audio_filename in audio_filenames:
    wf = wave.open(audio_filename, "rb")

    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)

    frames = 4000

    # Mix channels together if the input is stereo
    # or the sample width is incompatible
    # (in practice this keeps just the left channel)
    if wf.getnchannels() == 2:
        wf.close()
        mono_filename = '.'.join(audio_filename.split('.')[:-1]) + '_mono.wav'
        fs, data = wavfile.read(audio_filename)
        wavfile.write(mono_filename, fs, data[:, 0])
        wf = wave.open(mono_filename, 'rb')

    wf.rewind()
    results = []

    # Keep only results that actually contain words
    # (an empty result is just {'text': ''})
    def filter_result(result):
        result = json.loads(result)
        if len(result) != 1:
            results.append(result)

    while True:
        data = wf.readframes(frames)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            filter_result(rec.Result())
    filter_result(rec.FinalResult())

    with open(f"{audio_filename}_{frames}.json", "w") as f:
        lines = {}
        for sentence in results:
            words = sentence['result']
            text = sentence['text']
            # Account for duplicate sentences:
            if text not in lines:
                lines[text] = []
            lines[text].append({'start': words[0]['start'], 'end': words[-1]['end']})
            print(f'{text}: {words[0]["start"]} {words[-1]["end"]}')
        json.dump(lines, f)
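For reference, the JSON this script wrote maps each recognized sentence to every time range it was heard at; the replacement below keeps the same shape but keys on expected script lines instead of raw recognition output. A sketch with invented values:

    {
        "i never said that": [
            {"start": 12.34, "end": 14.02},
            {"start": 58.71, "end": 60.4}
        ]
    }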
scripts/transcribe-voice-tracks.py (new file, 105 lines):
@@ -0,0 +1,105 @@
#! /usr/bin/env python

# pip install -r requirements.txt

usage = 'python transcribe-voice-tracks.py <fountain filename> <character> <wav filenames...>'

import util
from imports import *  # presumably supplies json used below

import whisper
model = whisper.load_model("turbo")

import re
punc_list = ['.', '!', '?', ';', '--']
re_punc_list = [re.escape(punc) for punc in punc_list]

fountain_file = util.arg(1, usage)
character = util.arg(2, usage)
audio_filenames = util.args(3, usage)

lines = []
with open(fountain_file, 'r') as f:
    lines = f.readlines()

# Put a list of dialog lines wanted into a FuzzyMap
fmap = util.FuzzyMap()
all_partials = {}
prev_found = {}

idx = 0
while idx < len(lines) - 1:
    line = lines[idx].strip()
    idx += 1

    # If it ends with punctuation, it's probably a scene line, not a speech name
    if any(line.endswith(punc) for punc in punc_list):
        continue

    if len(line) == 0:
        continue

    # If it has lower-case letters, it's not a speech name
    tokens = line.split(" ")
    all_upper = True
    for token in tokens:
        if token.upper() != token:
            all_upper = False
            break

    # It's probably a speech name
    if all_upper:
        name = line
        if '(' in name:
            name = name[:name.find('(')].strip()

        line = lines[idx].strip()
        idx += 1

        # Skip wryly lines
        if line.startswith('('):
            line = line[line.find(')') + 1:].strip()
            if len(line) == 0:
                line = lines[idx].strip()
                idx += 1

        if character.upper() != name:
            continue

        # Put the line in the map
        fmap.put(line, [])
        # TODO this is experimental:
        # Put each part of the line in the map, so we can try to catch partial matches!
        partials = re.split('|'.join(re_punc_list), line)
        if len(partials) > 1:
            for part in partials:
                part = part.strip()
                fmap.put(part, [])
                all_partials[part] = True

map = fmap.map
print(map)

for audio_filename in audio_filenames:
    result = model.transcribe(audio_filename)
    print(result['segments'])
    for segment in result['segments']:
        match = fmap.best_match(segment['text'])
        if match in all_partials:
            print(f'PARTIAL FOUND: {match}')
        # best_match may find nothing; .get lets a missing key
        # fall through to the None check below
        match_list = fmap.map.get(match)
        if match_list is not None:
            match_list.append({'start': segment['start'], 'end': segment['end']})

    to_dump = {}
    for key in list(map.keys()):
        if len(map[key]) != 0:
            to_dump[key] = map[key]
            prev_found[key] = True
            map[key] = []

    with open(f"{audio_filename}.{character}.json", "w") as f:
        json.dump(to_dump, f)

for key in map.keys():
    if key not in all_partials and len(map[key]) == 0 and key not in prev_found:
        print(f'NOT FOUND: {key}')
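The while-loop parser assumes standard Fountain conventions: an all-caps line is a character cue, optionally carrying an extension like "(V.O.)", a parenthesized line after the cue is a wryly, and the dialog follows. A made-up excerpt of the kind of block it matches when run with character ALICE:

    ALICE (V.O.)
    (whispering)
    I never said that. Not once!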
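util.FuzzyMap is not part of this diff, so its interface can only be inferred from the call sites: put(key, value), best_match(text), and a plain .map dict. A minimal stand-in built on the standard library's difflib, assuming best_match returns the most similar stored key or None:

    import difflib

    class FuzzyMap:
        # Hypothetical stand-in for util.FuzzyMap, inferred from its call
        # sites; the repo's real implementation may differ.
        def __init__(self):
            self.map = {}

        def put(self, key, value):
            self.map[key] = value

        def best_match(self, text, cutoff=0.6):
            # Return the stored key most similar to `text`, or None if
            # nothing clears the similarity cutoff.
            matches = difflib.get_close_matches(text.strip(), list(self.map.keys()),
                                                n=1, cutoff=cutoff)
            return matches[0] if matches else None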