import gradio as gr

from tts_webui.decorators.gradio_dict_decorator import gradio_dict_decorator
from tts_webui.utils.randomize_seed import randomize_seed_ui
from tts_webui.utils.manage_model_state import manage_model_state
from tts_webui.utils.list_dir_models import unload_model_button
from tts_webui.decorators.decorator_apply_torch_seed import decorator_apply_torch_seed
from tts_webui.decorators.decorator_log_generation import decorator_log_generation
from tts_webui.decorators.decorator_save_metadata import decorator_save_metadata
from tts_webui.decorators.decorator_save_wav import decorator_save_wav
from tts_webui.decorators.decorator_add_base_filename import decorator_add_base_filename
from tts_webui.decorators.decorator_add_date import decorator_add_date
from tts_webui.decorators.decorator_add_model_type import decorator_add_model_type
from tts_webui.decorators.log_function_time import log_function_time
from tts_webui.extensions_loader.decorator_extensions import (
    decorator_extension_outer,
    decorator_extension_inner,
)

SAMPLE_RATE = 24_000


@manage_model_state("style_tts2")
def get_model(model_name=""):
    from styletts2.tts import StyleTTS2

    # An empty model_name falls through to the package's bundled default
    # checkpoint and config.
    return StyleTTS2(
        model_checkpoint_path=None if model_name == "" else model_name,
        config_path=None,
    )
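
# manage_model_state (paired with unload_model_button below) appears to hold
# the loaded model in a managed cache, so repeated get_model("") calls should
# reuse one StyleTTS2 instance rather than reloading the checkpoint, e.g.:
#   model = get_model("")  # first call loads the default checkpoint
#   model = get_model("")  # subsequent calls return the cached instance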


def preview_phonemization(text):
    from nltk.tokenize import word_tokenize

    style_tts2_model = get_model("")
    text = text.strip()
    text = text.replace('"', "")
    phonemized_text = style_tts2_model.phoneme_converter.phonemize(text)
    ps = word_tokenize(phonemized_text)
    phoneme_string = " ".join(ps)
    return phoneme_string
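
# Note: word_tokenize depends on NLTK's "punkt" tokenizer data (newer NLTK
# releases ask for "punkt_tab"); if it is missing, a one-time download is
# needed (assumption: the webui does not fetch it automatically):
#   import nltk
#   nltk.download("punkt")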


@decorator_extension_outer
@decorator_apply_torch_seed
@decorator_save_metadata
@decorator_save_wav
@decorator_add_model_type("style_tts2")
@decorator_add_base_filename
@decorator_add_date
@decorator_log_generation
@decorator_extension_inner
@log_function_time
def generate_audio_styleTTS2(
    text,
    alpha=0.3,
    beta=0.7,
    diffusion_steps=5,
    embedding_scale=1,
    **kwargs,
):
    model = get_model("")
    audio_array = model.inference(
        text=text,
        alpha=alpha,
        beta=beta,
        diffusion_steps=diffusion_steps,
        embedding_scale=embedding_scale,
        # target_voice_path=target_voice_path,
        # ref_s=None,
        # phonemize=True
    )
    return {"audio_out": (SAMPLE_RATE, audio_array)}
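
# The decorator stack applies bottom-up: timing and inner extension hooks wrap
# the raw inference, followed by generation logging, date/filename/model-type
# stamping, WAV and metadata saving, torch seeding, and outer extension hooks.
# A minimal direct call, assuming decorator_apply_torch_seed consumes the
# "seed" kwarg that the UI passes below, might look like:
#   result = generate_audio_styleTTS2(text="Hello there.", seed=0)
#   sample_rate, audio_array = result["audio_out"]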


def style_tts2_ui():
    gr.Markdown(
        """
    # StyleTTS2 Demo
    To use it, simply enter your text and click "Generate".
    The model will generate audio from the text.
    It uses the [StyleTTS2](https://styletts2.github.io/) model via the [Python package](https://github.com/sidharthrajaram/StyleTTS2).
    As a result, the phonemizer is an MIT-licensed substitute.

    Parameters:
    * text: Input text to turn into speech.
    * alpha: Determines the timbre of the speech; higher values condition the style more on the text and less on the target voice.
    * beta: Determines the prosody of the speech; higher values condition the style more on the text and less on the target voice.
    * diffusion_steps: More steps produce more diverse samples, at the cost of speed.
    * embedding_scale: Higher values condition the style more strongly on the input text, making the speech more emotional.
    """
    )

    text = gr.Textbox(label="Text", lines=3, placeholder="Enter text here...")

    preview_phonemized_text_button = gr.Button("Preview phonemized text")
    phonemized_text = gr.Textbox(
        label="Phonemized text (what the model will see)", interactive=False
    )
    preview_phonemized_text_button.click(
        fn=preview_phonemization,
        inputs=[text],
        outputs=[phonemized_text],
        api_name="style_tts2_phonemize",
    )

    with gr.Row():
        alpha = gr.Slider(label="Alpha (timbre)", minimum=-0.5, maximum=2.0, value=0.3)
        beta = gr.Slider(label="Beta (prosody)", minimum=-1.0, maximum=2.0, value=0.7)
        diffusion_steps = gr.Slider(
            label="Diffusion Steps (diversity)", minimum=1, maximum=20, value=5, step=1
        )
        embedding_scale = gr.Slider(
            label="Embedding Scale (emotion)", minimum=0.5, maximum=1.5, value=1.0
        )
        unload_model_button("style_tts2")

    with gr.Row():
        reset_params_button = gr.Button("Reset params")
        reset_params_button.click(
            fn=lambda: [
                gr.Slider(value=0.3),
                gr.Slider(value=0.7),
                gr.Slider(value=5),
                gr.Slider(value=1.0),
            ],
            outputs=[
                alpha,
                beta,
                diffusion_steps,
                embedding_scale,
            ],
        )

    generate_button = gr.Button("Generate", variant="primary")
    audio_out = gr.Audio(label="Generated audio")

    seed, randomize_seed_callback = randomize_seed_ui()

    input_dict = {
        text: "text",
        alpha: "alpha",
        beta: "beta",
        diffusion_steps: "diffusion_steps",
        embedding_scale: "embedding_scale",
        seed: "seed",
    }

    output_dict = {
        "audio_out": audio_out,
        "metadata": gr.JSON(label="Metadata", visible=False),
        "folder_root": gr.Textbox(label="Folder root", visible=False),
    }

    generate_button.click(
        **randomize_seed_callback,
    ).then(
        fn=gradio_dict_decorator(
            fn=generate_audio_styleTTS2,
            gradio_fn_input_dictionary=input_dict,
            outputs=output_dict,
        ),
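        # Passing the components as a set makes Gradio supply the wrapped fn
        # with a dict keyed by component; gradio_dict_decorator then maps those
        # values to the kwarg names declared in input_dict.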
        inputs={*input_dict},
        outputs=list(output_dict.values()),
        api_name="style_tts2_generate",
    )


def style_tts2_tab():
    with gr.Tab("StyleTTS2"):
        style_tts2_ui()


if __name__ == "__main__":
    # Close a previous Blocks instance if this file is being re-run in an
    # interactive session.
    if "demo" in locals():
        locals()["demo"].close()

    with gr.Blocks() as demo:
        style_tts2_tab()

    demo.launch()