I am currently trying to generate a TTS audio file from a subtitle file I generated. The processing is basic: the file is loaded and parsed line by line, and the timestamp lines always start with `##`. Code for cleaning the subtitles:
# Duration of the source video, as reported by VideoFileClip.
duration = VideoFileClip(f"media/{name}.mp4").duration

# Walk the subtitle file line by line. A line containing '##' is a timestamp
# header (presumably "## <start> - <end> :"); the non-blank lines after it are
# the text for that interval.
# NOTE(review): indentation was lost in the original paste; this layout assumes
# `data +=` belongs to the text-line branch -- confirm against the real file.
for i, line in enumerate(lines):
    if '##' in line:
        # Strip the '## ' prefix and the ' :' suffix. float() ignores the
        # trailing newline that remains on the end value.
        timestamp = line.replace('## ', '').replace(' :', '')
        start_time, end_time = map(float, timestamp.split(" - "))
        # A header followed by a blank line -- or by nothing at all, when it
        # is the last line of the file -- has no text: record it as silence.
        # The bounds check avoids an IndexError on a trailing header.
        if i + 1 >= len(lines) or lines[i + 1] == '\n':
            subtitle_entries.append({"start": start_time, "end": end_time,
                                     "text": "<empty>"})
    elif line == '\n':
        continue
    else:
        # Text line: it reuses the times of the most recently parsed header.
        text = line.lstrip()
        subtitle_entries.append({"start": start_time, "end": end_time,
                                 "text": text if text != '' else "<empty>"})
        # Accumulate the raw text (minus its last character, normally the
        # newline) into one space-separated string.
        data += line[:-1] + ' '
# Build the final timeline: every parsed subtitle entry, with an "<empty>"
# entry inserted wherever the timeline has a gap before, between, or after
# the subtitles.
final_entries = []
# End time of the part of the timeline covered so far; the timeline starts at 0.
previous_end = 0
for entry in subtitle_entries:
    # Only fill a real gap. Using `<` instead of `!=` avoids emitting a
    # negative-length "<empty>" entry when entries overlap or share the same
    # timestamps (e.g. several text lines under one header).
    if previous_end < entry['start']:
        final_entries.append({"start": previous_end, "end": entry['start'],
                              "text": "<empty>"})
    final_entries.append(entry)
    previous_end = entry['end']
# Pad the tail up to the total duration. This also covers the case where
# there are no subtitle entries at all, which previously raised IndexError.
if previous_end < total_dur:
    final_entries.append({"start": previous_end, "end": total_dur, "text": "<empty>"})
This just builds the list of entry objects (spoken text plus `<empty>` gaps) that get passed to the TTS model. This is my logic for generating the final TTS audio:
from pydub import AudioSegment
# Render each timeline entry to audio: "<empty>" entries become silence of the
# entry's length, every other entry is synthesized to a WAV file and loaded.
for entry in final_entries:
    slot_ms = (entry['end'] - entry['start']) * 1000
    if entry['text'] == "<empty>":
        audio_files.append(AudioSegment.silent(duration=slot_ms))
        continue

    # The WAV file is named after the entry's end time.
    wav_path = f"media/{name}/{entry['end']}.wav"
    tts.tts_to_file(text=entry['text'], file_path=wav_path)
    audio_file = AudioSegment.from_wav(wav_path)
    # len() of an AudioSegment is its length in milliseconds.
    original_dur = len(audio_file)
    expected_dur = slot_ms
    # Speedup or slow down the audio file here
    audio_files.append(audio_file)
The problem is that I have tried using librosa and pydub to speed up or slow down the TTS audio so that it matches the given duration; that way I can embed the audio in my target video file and it will stay in sync with the speaker. I know I could use commercial TTS APIs that can match a requested duration, but I want to limit myself to open-source tools. So I would like help writing logic that speeds up or slows down (i.e. stretches or shrinks) the audio by some factor so that it matches the given duration. How do I achieve that using libraries like librosa, pydub or moviepy?
I have tried using librosa; here are the code and the error logs for the librosa implementation:
Code
# Render each timeline entry to audio and time-stretch the synthesized speech
# so that it fills exactly the entry's time slot.
for entry in final_entries:
    if entry['text'] == "<empty>":
        audio_files.append(AudioSegment.silent(duration=(entry['end'] - entry['start']) * 1000))
    else:
        # The WAV file is named after the entry's end time and is overwritten
        # in place with the stretched audio below.
        wav_path = f"media/{name}/{entry['end']}.wav"
        tts.tts_to_file(text=entry['text'], file_path=wav_path)
        audio_file = AudioSegment.from_wav(wav_path)
        original_durr = len(audio_file)  # milliseconds
        expected_durr = (entry['end'] - entry['start']) * 1000  # milliseconds
        # The stretch rate is undefined for a zero/negative duration; in that
        # case the clip is kept unstretched instead of raising.
        if original_durr > 0 and expected_durr > 0:
            y, sr = librosa.load(wav_path)
            # time_stretch returns a signal of length len(y) / rate
            # (rate > 1 speeds up, rate < 1 slows down), so hitting the
            # expected duration needs rate = original / expected -- the
            # inverse of the ratio used previously.
            y_output = librosa.effects.time_stretch(y, rate=(original_durr / expected_durr))
            sf.write(wav_path, y_output, sr)
            audio_file = AudioSegment.from_wav(wav_path)
        audio_files.append(audio_file)
Error logs
Internal Server Error: /video_downloader/download/
Traceback (most recent call last):
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\exception.py", line 55, in inner
response = get_response(request)
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\base.py", line 197, in _get_response
response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "C:\Academics\transcriber\video_downloader\views.py", line 158, in download_video
audio_file = tts_convertor(translated_file, target_language, name)
File "C:\Academics\transcriber\video_processor\processor.py", line 148, in tts_convertor
y_output = librosa.effects.time_stretch(y, rate=(expected_durr / original_durr))
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\effects.py", line 245, in time_stretch
stft_stretch = core.phase_vocoder(
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\core\spectrum.py", line 1457, in phase_vocoder
d_stretch[..., t] = util.phasor(phase_acc, mag=mag)
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\util\utils.py", line 2602, in phasor
z = _phasor_angles(angles)
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numba\np\ufunc\dufunc.py", line 190, in __call__
return super().__call__(*args, **kws)
numpy.core._exceptions._UFuncNoLoopError: ufunc '_phasor_angles' did not contain a loop with signature matching types <class 'numpy.dtype[float32]'> -> None
ERROR:django.request:Internal Server Error: /video_downloader/download/
Traceback (most recent call last):
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\exception.py", line 55, in inner
response = get_response(request)
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\base.py", line 197, in _get_response
response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "C:\Academics\transcriber\video_downloader\views.py", line 158, in download_video
audio_file = tts_convertor(translated_file, target_language, name)
File "C:\Academics\transcriber\video_processor\processor.py", line 148, in tts_convertor
y_output = librosa.effects.time_stretch(y, rate=(expected_durr / original_durr))
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\effects.py", line 245, in time_stretch
stft_stretch = core.phase_vocoder(
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\core\spectrum.py", line 1457, in phase_vocoder
d_stretch[..., t] = util.phasor(phase_acc, mag=mag)
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\util\utils.py", line 2602, in phasor
z = _phasor_angles(angles)
File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numba\np\ufunc\dufunc.py", line 190, in __call__
return super().__call__(*args, **kws)
numpy.core._exceptions._UFuncNoLoopError: ufunc '_phasor_angles' did not contain a loop with signature matching types <class 'numpy.dtype[float32]'> -> None
[03/Dec/2023 14:29:23] "POST /video_downloader/download/ HTTP/1.1" 500 141577