Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inbuilt subtitle file generation support for podcasts and YouTube videos #2751

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,11 @@ def tts(
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
generate_subtitles : bool = False,
subtitle_file_path : str = "output.srt",
subtitle_batch_time : int = 3, #basically text of how much time should be shown in one line (couldn't find a better name)
beforeTimeMargin = 0,
afterTimeMargin = 0,
**kwargs,
):
"""Convert text to speech.
Expand Down Expand Up @@ -549,6 +554,11 @@ def tts(
style_wav=None,
style_text=None,
reference_speaker_name=None,
generate_subtitles= generate_subtitles,
subtitle_file_path= subtitle_file_path,
subtitle_batch_time= subtitle_batch_time,
beforeTimeMargin= beforeTimeMargin,
afterTimeMargin= afterTimeMargin,
**kwargs,
)
return wav
Expand All @@ -562,6 +572,11 @@ def tts_to_file(
emotion: str = "Neutral",
speed: float = 1.0,
file_path: str = "output.wav",
generate_subtitles : bool = False,
subtitle_file_path : str = "output.srt",
subtitle_batch_time : int = 3, #basically text of how much time should be shown in one line (couldn't find a better name)
beforeTimeMargin = 0,
afterTimeMargin = 0,
**kwargs,
):
"""Convert text to speech.
Expand Down Expand Up @@ -593,7 +608,7 @@ def tts_to_file(
return self.tts_coqui_studio(
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, generate_subtitles=generate_subtitles, subtitle_file_path=subtitle_file_path, subtitle_batch_time=subtitle_batch_time, beforeTimeMargin=beforeTimeMargin, afterTimeMargin=afterTimeMargin, **kwargs)
self.synthesizer.save_wav(wav=wav, path=file_path)
return file_path

Expand Down
120 changes: 120 additions & 0 deletions TTS/utils/synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
import os
import time
from typing import List
Expand Down Expand Up @@ -247,6 +248,93 @@ def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]:
output_wav = self.vc_model.voice_conversion(source_wav, target_wav)
return output_wav

def get_subtitle_time(
    self,
    subtitle_time
):
    """Format a time offset in seconds as a SubRip (SRT) timestamp.

    Args:
        subtitle_time (float): Offset from the start of the audio, in seconds.

    Returns:
        str: Zero-padded ``HH:MM:SS,mmm`` timestamp (e.g. ``01:02:03,450``),
        as required by the .srt format.
    """
    milliseconds = math.floor((subtitle_time % 1) * 1000)
    total_seconds = math.floor(subtitle_time)
    seconds = total_seconds % 60
    # Bug fix: minutes must wrap at 60 — the old floor(t / 60) emitted
    # e.g. "1:61:40" for 3700 s. Fields are also zero-padded now, since
    # unpadded components are invalid in the SRT format.
    minutes = (total_seconds // 60) % 60
    hours = total_seconds // 3600
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

def generate_subtitles(
    self,
    subtitles,
    beforeTimeMargin = 0,
    afterTimeMargin = 0,
    subtitle_batch_time = 3,
    subtitle_file_path = "output.srt"
):
    """Write a SubRip (.srt) file for a list of sentence-level cues.

    Each entry of ``subtitles`` is a dict with keys ``text``, ``start_time``
    and ``end_time`` (times in seconds). Long sentences are split at word
    boundaries into parts of roughly ``subtitle_batch_time`` seconds each,
    and every part receives an equal share of the sentence's duration.

    Args:
        subtitles (list[dict]): Sentence cues to split and write out.
        beforeTimeMargin (float): Seconds to show each cue earlier
            (clamped so start times never go below zero).
        afterTimeMargin (float): Seconds to keep each cue visible longer.
        subtitle_batch_time (int): Target on-screen duration of one cue.
        subtitle_file_path (str): Destination path of the .srt file.
    """
    writables = []  # fully processed cues, ready to be written to the file
    for subtitle in subtitles:
        text = str(subtitle["text"])
        start_time = subtitle["start_time"]
        end_time = subtitle["end_time"]
        duration = end_time - start_time

        # How many cues this sentence should be split into, and roughly
        # how many characters each cue may therefore hold.
        recom_parts_to_break = math.floor(duration / subtitle_batch_time) + 1
        recom_char_count = math.floor(len(text) / recom_parts_to_break) + 1

        # Greedily pack whole words into parts of <= recom_char_count chars.
        parts = []
        buffer = ""
        buffer_length = 0
        for word in text.split(" "):
            # Bug fix: only flush a non-empty buffer, so a single word
            # longer than the budget no longer produces an empty cue.
            if buffer != "" and len(word) + buffer_length > recom_char_count:
                parts.append(buffer)
                buffer = ""
                buffer_length = 0
            if buffer != "":
                buffer += " "
            buffer += word
            buffer_length += len(word)
        if buffer != "":
            # Flush trailing words that never exceeded the limit.
            parts.append(buffer)

        if not parts:
            # Empty sentence: nothing to show; skipping also avoids the
            # division by zero in part_duration below.
            continue

        part_duration = duration / len(parts)
        for part_index, part in enumerate(parts):
            part_start_time = start_time + part_duration * part_index
            # Show the cue a bit earlier, but never before time zero —
            # negative timestamps are invalid in SRT.
            part_start_time = max(0, part_start_time - beforeTimeMargin)
            part_end_time = start_time + part_duration * (part_index + 1) + afterTimeMargin
            writables.append(
                {
                    "text": part,
                    "part_start_time": part_start_time,
                    "part_end_time": part_end_time,
                }
            )

    # A single 'w' open truncates the file, replacing the old
    # truncate-then-reopen-for-append dance; the context manager
    # guarantees the handle is closed even if a write fails.
    with open(subtitle_file_path, "w", encoding="utf-8") as subtitle_file:
        for writable_index, writable in enumerate(writables, start=1):
            subtitle_file.write(str(writable_index) + "\n")
            subtitle_file.write(
                self.get_subtitle_time(writable["part_start_time"])
                + " --> "
                + self.get_subtitle_time(writable["part_end_time"])
                + "\n"
            )
            subtitle_file.write(writable["text"] + "\n\n")

def tts(
self,
text: str = "",
Expand All @@ -257,6 +345,11 @@ def tts(
style_text=None,
reference_wav=None,
reference_speaker_name=None,
generate_subtitles : bool = False,
subtitle_file_path : str = "output.srt",
subtitle_batch_time : int = 3, #basically text of how much time should be shown in one line (couldn't find a better name)
beforeTimeMargin = 0,
afterTimeMargin = 0,
**kwargs,
) -> List[int]:
"""🐸 TTS magic. Run all the models and generate speech.
Expand All @@ -275,6 +368,8 @@ def tts(
"""
start_time = time.time()
wavs = []
subtitles = []
time_accounted_for = 0

if not text and not reference_wav:
raise ValueError(
Expand Down Expand Up @@ -359,6 +454,7 @@ def tts(
use_gl = self.vocoder_model is None

if not reference_wav: # not voice conversion
sen_index = 0 #this is for development only, to see how many sentences have been processed because I am testing with very large piece of texts.
for sen in sens:
if hasattr(self.tts_model, "synthesize"):
sp_name = "random" if speaker_name is None else speaker_name
Expand Down Expand Up @@ -414,8 +510,26 @@ def tts(
if "do_trim_silence" in self.tts_config.audio and self.tts_config.audio["do_trim_silence"]:
waveform = trim_silence(waveform, self.tts_model.ap)

if generate_subtitles:
wave_time = len(waveform) / self.tts_config.audio["sample_rate"]
subtitle = {
"text" : sen,
"start_time" : time_accounted_for,
"end_time" : time_accounted_for + wave_time
}
subtitles.append(subtitle)


wavs += list(waveform)
wavs += [0] * 10000

if generate_subtitles:
time_accounted_for += wave_time
time_accounted_for += 10000 / self.tts_config.audio["sample_rate"]

sen_index += 1
print(f"{sen_index} sentences computed out of {len(sens)} total sentences.") # i wanted to implement \r to keep console clean but sometimes the max decoder steps message will also log which will break it, that's why i didnt implement it. and i dont know how to deal with \r and logging in place of current line when another piece of code is logging too.

else:
# get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None
Expand Down Expand Up @@ -475,9 +589,15 @@ def tts(
waveform = waveform.numpy()
wavs = waveform.squeeze()

if generate_subtitles:
self.generate_subtitles(subtitles, beforeTimeMargin, afterTimeMargin, subtitle_batch_time, subtitle_file_path)
print("Subtitles generated")

# compute stats
process_time = time.time() - start_time
audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
print(f" > Processing time: {process_time}")
print(f" > Real-time factor: {process_time / audio_time}")
return wavs


1 change: 1 addition & 0 deletions subtitle_example/important.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The audio has some disturbances in its beginning section, where short sentences and number reading cause it to stutter. This stutter comes from the AI synthesis logic itself and has nothing to do with the subtitle code.
Binary file added subtitle_example/output.mp4
Binary file not shown.
Loading
Loading