Fitting a .wav file to target duration by speeding/slowing in python

51 views Asked by At

I am currently trying to generate TTS audio from a subtitle file I generated. The file is loaded and processed line by line, and the timestamp lines always start with `##`. Code for cleaning the subtitles:

# Parse the subtitle lines into timed entries. A line containing "##" is a
# timestamp line (presumably shaped like "## <start> - <end> :"); every text
# line after it reuses the most recently parsed start/end times.
duration = VideoFileClip(f"media/{name}.mp4").duration
for i, line in enumerate(lines):
    if '##' in line:
        # Strip the "## " prefix and " :" suffix; float() tolerates the
        # newline that may remain after the end time.
        timestamp = line.replace('## ', '').replace(' :', '').replace(' :\n', '')
        start_time, end_time = map(float, timestamp.split(" - "))
        # A timestamp followed by a blank line has nothing to speak. The same
        # holds when the timestamp is the very last line, where looking at
        # lines[i + 1] would otherwise raise IndexError.
        if i + 1 >= len(lines) or lines[i + 1] == '\n':
            subtitle_entries.append({"start": start_time, "end": end_time,
                                     "text": "<empty>"})
    elif line == '\n':
        continue
    else:
        subtitle_entries.append({"start": start_time, "end": end_time,
                                 "text": line.lstrip() if line.lstrip() != '' else "<empty>"})
        # Remove only the line break: slicing off the last character would eat
        # a real character when the final line has no trailing newline.
        data += line.rstrip('\n') + ' '

# Build a gap-free timeline: every stretch between 0 and total_dur that no
# subtitle covers is filled with an explicit "<empty>" entry.
final_entries = []
last_subtitle = subtitle_entries[-1]  # raises IndexError when there are no subtitles

cursor = 0  # end time of the timeline built so far
for subtitle in subtitle_entries:
    if cursor != subtitle['start']:
        final_entries.append({"start": cursor, "end": subtitle['start'], "text": "<empty>"})
    final_entries.append(subtitle)
    cursor = subtitle['end']

# Pad the tail so the timeline ends at total_dur.
if last_subtitle['end'] != total_dur:
    final_entries.append({"start": last_subtitle['end'], "end": total_dur, "text": "<empty>"})

This just builds the list of entry objects to pass to the TTS model. This is my logic for generating the final TTS audio:

from pydub import AudioSegment
# Turn every timeline entry into an audio segment: silence for "<empty>"
# entries, synthesized speech for everything else.
for entry in final_entries:
    if entry['text'] == "<empty>":
        gap_ms = (entry['end'] - entry['start']) * 1000
        audio_files.append(AudioSegment.silent(duration=gap_ms))
        continue

    # The clip is written to disk by the TTS engine and then read back.
    wav_path = f"media/{name}/{entry['end']}.wav"
    tts.tts_to_file(text=entry['text'], file_path=wav_path)
    audio_file = AudioSegment.from_wav(wav_path)

    original_dur = len(audio_file)
    expected_dur = (entry['end'] - entry['start']) * 1000

    # Speedup or slow down the audio file here

    audio_files.append(audio_file)

The problem is that I have tried using librosa and pydub to speed up or slow down the TTS file to match the given duration, so that I can embed the audio in my target video file and have it sync with what the speaker is saying. I know I could have used commercial TTS APIs that provide duration matching, but I want to limit myself to open-source tools. I would therefore like help writing logic that stretches or shrinks the audio by some factor so that it matches the given duration. How do I achieve that using libraries like librosa, pydub or moviepy?

I have tried using librosa; here are the code and the error logs for the librosa implementation:

Code

# Turn every timeline entry into an audio segment, time-stretching each
# synthesized clip so that it fills exactly its subtitle slot.
for entry in final_entries:
    if entry['text'] == "<empty>":
        audio_files.append(AudioSegment.silent(duration=(entry['end'] - entry['start']) * 1000))
    else:
        wav_path = f"media/{name}/{entry['end']}.wav"
        tts.tts_to_file(text=entry['text'], file_path=wav_path)

        audio_file = AudioSegment.from_wav(wav_path)
        original_durr = len(audio_file)  # pydub segment length, in milliseconds
        expected_durr = (entry['end'] - entry['start']) * 1000

        y, sr = librosa.load(wav_path)
        # time_stretch speeds the signal up for rate > 1, so the output lasts
        # original / rate. Reaching the expected duration therefore needs
        # rate = original / expected (the inverse ratio would stretch a clip
        # that is already too long even further).
        y_output = librosa.effects.time_stretch(y, rate=(original_durr / expected_durr))
        sf.write(wav_path, y_output, sr)

        # Reload the stretched clip so the list holds the adjusted audio.
        audio_file = AudioSegment.from_wav(wav_path)
        audio_files.append(audio_file)

Error logs

Internal Server Error: /video_downloader/download/
Traceback (most recent call last):
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\exception.py", line 55, in inner
    response = get_response(request)
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\base.py", line 197, in _get_response
    response = wrapped_callback(request, *callback_args, **callback_kwargs)
  File "C:\Academics\transcriber\video_downloader\views.py", line 158, in download_video
    audio_file = tts_convertor(translated_file, target_language, name)
  File "C:\Academics\transcriber\video_processor\processor.py", line 148, in tts_convertor
    y_output = librosa.effects.time_stretch(y, rate=(expected_durr / original_durr))
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\effects.py", line 245, in time_stretch
    stft_stretch = core.phase_vocoder(
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\core\spectrum.py", line 1457, in phase_vocoder
    d_stretch[..., t] = util.phasor(phase_acc, mag=mag)
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\util\utils.py", line 2602, in phasor
    z = _phasor_angles(angles)
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numba\np\ufunc\dufunc.py", line 190, in __call__
    return super().__call__(*args, **kws)
numpy.core._exceptions._UFuncNoLoopError: ufunc '_phasor_angles' did not contain a loop with signature matching types <class 'numpy.dtype[float32]'> -> None
ERROR:django.request:Internal Server Error: /video_downloader/download/
Traceback (most recent call last):
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\exception.py", line 55, in inner
    response = get_response(request)
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\django\core\handlers\base.py", line 197, in _get_response
    response = wrapped_callback(request, *callback_args, **callback_kwargs)
  File "C:\Academics\transcriber\video_downloader\views.py", line 158, in download_video
    audio_file = tts_convertor(translated_file, target_language, name)
  File "C:\Academics\transcriber\video_processor\processor.py", line 148, in tts_convertor
    y_output = librosa.effects.time_stretch(y, rate=(expected_durr / original_durr))
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\effects.py", line 245, in time_stretch
    stft_stretch = core.phase_vocoder(
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\core\spectrum.py", line 1457, in phase_vocoder
    d_stretch[..., t] = util.phasor(phase_acc, mag=mag)
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\util\utils.py", line 2602, in phasor
    z = _phasor_angles(angles)
  File "C:\Users\leofi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numba\np\ufunc\dufunc.py", line 190, in __call__
    return super().__call__(*args, **kws)
numpy.core._exceptions._UFuncNoLoopError: ufunc '_phasor_angles' did not contain a loop with signature matching types <class 'numpy.dtype[float32]'> -> None
[03/Dec/2023 14:29:23] "POST /video_downloader/download/ HTTP/1.1" 500 141577
0

There are 0 answers