# extended_generate.py
from typing import Dict, Optional, Union

from bark import text_to_semantic, semantic_to_waveform
from bark.generation import SEMANTIC_RATE_HZ
  4. def custom_generate_audio(
  5. text: str,
  6. burn_in_prompt: Optional[str] = None,
  7. history_prompt: Optional[Union[Dict, str]] = None,
  8. history_prompt_semantic: Optional[Union[Dict, str]] = None,
  9. text_temp: float = 0.7,
  10. waveform_temp: float = 0.7,
  11. silent: bool = False,
  12. output_full: bool = False,
  13. max_length=None,
  14. **kwargs,
  15. ):
  16. """Generate audio array from input text.
  17. Args:
  18. text: text to be turned into audio
  19. history_prompt: history choice for audio cloning
  20. text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
  21. waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
  22. silent: disable progress bar
  23. output_full: return full generation to be used as a history prompt
  24. Returns:
  25. numpy audio array at sample frequency 24khz
  26. """
  27. history_prompt_semantic = history_prompt_semantic or history_prompt
  28. if burn_in_prompt is not None and len(burn_in_prompt) > 0:
  29. burn_in_prompt_semantic = text_to_semantic(
  30. burn_in_prompt,
  31. history_prompt=history_prompt_semantic,
  32. temp=text_temp,
  33. silent=silent,
  34. )
  35. history_prompt_semantic = {
  36. "coarse_prompt": None,
  37. "fine_prompt": None,
  38. "semantic_prompt": burn_in_prompt_semantic,
  39. }
  40. semantic_tokens = text_to_semantic(
  41. text,
  42. history_prompt=history_prompt_semantic,
  43. temp=text_temp,
  44. silent=silent,
  45. )
  46. out = semantic_to_waveform(
  47. semantic_tokens,
  48. history_prompt=history_prompt,
  49. temp=waveform_temp,
  50. silent=silent,
  51. output_full=output_full,
  52. max_gen_duration_s=max_length,
  53. )
  54. if output_full:
  55. full_generation, audio_arr = out
  56. if max_length is not None:
  57. semantic_tokens = semantic_tokens[: int(max_length * SEMANTIC_RATE_HZ)]
  58. full_generation["semantic_prompt"] = semantic_tokens
  59. return full_generation, audio_arr
  60. else:
  61. audio_arr = out
  62. return audio_arr