transcribe-voice-track allow multiple wav args
This commit is contained in:
@@ -1,6 +1,9 @@
|
|||||||
#! /usr/bin/env python
|
#! /usr/bin/env python
|
||||||
# pip install -r requirements.txt
|
# pip install -r requirements.txt
|
||||||
usage = 'python transcribe-voice-track.py <?wav filename>'
|
usage = 'python transcribe-voice-track.py <wav filenames...> '
|
||||||
|
|
||||||
|
# https://towardsdatascience.com/speech-recognition-with-timestamps-934ede4234b2
|
||||||
|
# If you don't get results, try re-exporting as Signed 16-bit PCM
|
||||||
|
|
||||||
import util
|
import util
|
||||||
import wave
|
import wave
|
||||||
@@ -29,16 +32,19 @@ if not os.path.exists(model_path):
|
|||||||
with ZipFile(model_zip_path, "r") as zip_file:
|
with ZipFile(model_zip_path, "r") as zip_file:
|
||||||
zip_file.extractall('models')
|
zip_file.extractall('models')
|
||||||
|
|
||||||
audio_filename = util.arg(1, usage)
|
|
||||||
wf = wave.open(audio_filename, "rb")
|
|
||||||
|
|
||||||
model = Model(model_path)
|
model = Model(model_path)
|
||||||
rec = KaldiRecognizer(model, wf.getframerate())
|
|
||||||
rec.SetWords(True)
|
|
||||||
|
|
||||||
frames = 4000
|
audio_filenames = util.args(1, usage)
|
||||||
while True:
|
for audio_filename in audio_filenames:
|
||||||
|
wf = wave.open(audio_filename, "rb")
|
||||||
|
|
||||||
|
rec = KaldiRecognizer(model, wf.getframerate())
|
||||||
|
rec.SetWords(True)
|
||||||
|
|
||||||
|
frames = 4000
|
||||||
|
|
||||||
# Mix channels together if the input is stereo
|
# Mix channels together if the input is stereo
|
||||||
|
# or the sample width is incompatible
|
||||||
if wf.getnchannels() == 2:
|
if wf.getnchannels() == 2:
|
||||||
wf.close()
|
wf.close()
|
||||||
mono_filename = '.'.join(audio_filename.split('.')[:-1]) + '_mono.wav'
|
mono_filename = '.'.join(audio_filename.split('.')[:-1]) + '_mono.wav'
|
||||||
@@ -72,9 +78,3 @@ while True:
|
|||||||
lines[text].append({'start': words[0]['start'], 'end': words[-1]['end']})
|
lines[text].append({'start': words[0]['start'], 'end': words[-1]['end']})
|
||||||
print(f'{text}: {words[0]["start"]} {words[-1]["end"]}')
|
print(f'{text}: {words[0]["start"]} {words[-1]["end"]}')
|
||||||
json.dump(lines, f)
|
json.dump(lines, f)
|
||||||
|
|
||||||
frames = input(f"Try different frames num? (was {frames}) (press ENTER to quit): ")
|
|
||||||
if len(frames) == 0:
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
frames = int(frames)
|
|
||||||
|
@@ -8,4 +8,14 @@ def arg(num, usage, default=None):
|
|||||||
if default != None:
|
if default != None:
|
||||||
return default
|
return default
|
||||||
raise ValueError(usage)
|
raise ValueError(usage)
|
||||||
return val
|
return val
|
||||||
|
|
||||||
|
def args(starting_num, usage, default=None):
    """Return every command-line argument from index ``starting_num`` onward.

    Args:
        starting_num: Index into ``sys.argv`` of the first argument to collect.
        usage: Message carried by the ``ValueError`` raised when there is
            nothing to return.
        default: Value returned as-is when ``sys.argv`` has no entry at
            ``starting_num``. ``None`` means "no fallback".

    Returns:
        The list ``sys.argv[starting_num:]`` when at least one argument is
        present at or after ``starting_num``; otherwise ``default``.

    Raises:
        ValueError: If no argument is present and ``default`` is ``None``.
    """
    if len(sys.argv) > starting_num:
        return sys.argv[starting_num:]
    # Identity check rather than `!= None`: equality would call a custom
    # __ne__ and is ambiguous for array-like defaults.
    if default is not None:
        return default
    raise ValueError(usage)
|
Reference in New Issue
Block a user