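"""Bark voice cloning tab for tts-webui.

Builds a reusable Bark voice prompt (.npz) from a reference recording in
three steps:

1. A HuBERT model extracts semantic vectors from the input audio.
2. A quantizer ("CustomTokenizer") maps those vectors to Bark semantic tokens.
3. EnCodec encodes the audio into discrete codebooks: all codebooks form the
   fine prompt, and the first two form the coarse prompt.

The three prompts are saved together as a FullGeneration .npz that Bark can
use as a history/voice prompt.
"""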

import os
from typing import TYPE_CHECKING

import numpy as np
import torch
import torchaudio
import gradio as gr

from encodec.model import EncodecModel
from encodec.utils import convert_audio

from bark.generation import load_codec_model
from tts_webui.bark.history_to_hash import history_to_hash
from tts_webui.bark.npz_tools import save_npz
from tts_webui.bark.FullGeneration import FullGeneration
from tts_webui.bark.get_audio_from_npz import get_audio_from_full_generation
from tts_webui.utils.date import get_date_string

if TYPE_CHECKING:
    from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
    from bark_hubert_quantizer.customtokenizer import CustomTokenizer


hubert_model = None


def _load_hubert_model(device):
    from bark_hubert_quantizer.hubert_manager import HuBERTManager
    from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert

    hubert_path = HuBERTManager.make_sure_hubert_installed()
    global hubert_model
    if hubert_model is None:
        hubert_model = CustomHubert(
            checkpoint_path=hubert_path,
            device=device,
        )
    return hubert_model
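

# The HuBERT model is loaded lazily and cached at module level; the tokenizer
# below follows the same pattern. The UI's "Clear models" button resets both
# caches to free (GPU) memory.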


def _get_semantic_vectors(hubert_model: "CustomHubert", path_to_wav: str, device):
    # Load the reference recording and downmix to mono if it has more than
    # one channel.
    wav, sr = torchaudio.load(path_to_wav)
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)
    wav = wav.to(device)
    return hubert_model.forward(wav, input_sample_hz=sr)


def get_semantic_vectors(path_to_wav: str, device):
    hubert_model = _load_hubert_model(device)
    return _get_semantic_vectors(hubert_model, path_to_wav, device)


tokenizer = None


def _load_tokenizer(
    model: str = "quantifier_hubert_base_ls960_14.pth",
    repo: str = "GitMylo/bark-voice-cloning",
    force_reload: bool = False,
    device="cpu",
) -> "CustomTokenizer":
    from bark_hubert_quantizer.customtokenizer import CustomTokenizer
    from bark_hubert_quantizer.hubert_manager import HuBERTManager

    tokenizer_path = HuBERTManager.make_sure_tokenizer_installed(
        model=model,
        repo=repo,
        local_file=model,
    )
    global tokenizer
    if tokenizer is None or force_reload:
        tokenizer = CustomTokenizer.load_from_checkpoint(
            tokenizer_path,
            map_location=device,
        )
        tokenizer.load_state_dict(torch.load(tokenizer_path, map_location=device))
    return tokenizer


def get_semantic_tokens(semantic_vectors: torch.Tensor, device):
    tokenizer = _load_tokenizer(device=device)
    return tokenizer.get_token(semantic_vectors)


def get_semantic_prompt(path_to_wav: str, device):
    semantic_vectors = get_semantic_vectors(path_to_wav, device)
    return get_semantic_tokens(semantic_vectors, device).cpu().numpy()
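

# Sketch (the path is hypothetical): get_semantic_prompt returns a 1-D numpy
# array of Bark semantic token ids, e.g.
#
#   semantic_prompt = get_semantic_prompt("samples/speaker.wav", device="cuda")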


def get_prompts(path_to_wav: str, use_gpu: bool):
    device = "cuda" if use_gpu else "cpu"
    semantic_prompt = get_semantic_prompt(path_to_wav, device)
    fine_prompt, coarse_prompt = get_encodec_prompts(path_to_wav, use_gpu)
    return FullGeneration(
        semantic_prompt=semantic_prompt,
        coarse_prompt=coarse_prompt,
        fine_prompt=fine_prompt,
    )


def get_encodec_prompts(path_to_wav: str, use_gpu=True):
    device = "cuda" if use_gpu else "cpu"
    model: EncodecModel = load_codec_model(use_gpu=use_gpu)
    wav, sr = torchaudio.load(path_to_wav)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0).to(device)
    model.to(device)

    # Extract discrete codes from EnCodec.
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    fine_prompt: np.ndarray = (
        torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
        .squeeze()
        .cpu()
        .numpy()
    )
    # The coarse prompt is just the first two codebooks of the fine prompt.
    coarse_prompt = fine_prompt[:2, :]
    return fine_prompt, coarse_prompt
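

# Expected shapes, assuming Bark's default 24 kHz EnCodec model with 8
# codebooks (a sketch; the path is hypothetical):
#
#   fine_prompt, coarse_prompt = get_encodec_prompts("samples/speaker.wav")
#   fine_prompt.shape    # (8, T)
#   coarse_prompt.shape  # (2, T)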


def save_cloned_voice(full_generation: FullGeneration):
    voice_name = f"voice_from_audio_{history_to_hash(full_generation)}"
    filename = f"voices/{voice_name}.npz"
    os.makedirs("voices", exist_ok=True)
    date = get_date_string()
    metadata = generate_cloned_voice_metadata(full_generation, date)
    save_npz(filename, full_generation, metadata)
    return filename


def generate_cloned_voice_metadata(full_generation, date):
    return {
        "_version": "0.0.1",
        "_hash_version": "0.0.2",
        "_type": "bark",
        "hash": history_to_hash(full_generation),
        "date": date,
    }


def tab_voice_clone():
    with gr.Tab("Bark Voice Clone"), gr.Row(equal_height=False):
        with gr.Column():
            gr.Markdown(
                """
                Unethical use of this technology is prohibited.

                This demo is based on the https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer repository.

                Information from the original repository (https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer?tab=readme-ov-file#voices-cloned-arent-very-convincing-why-are-other-peoples-cloned-voices-better-than-mine):

                ## Voices cloned aren't very convincing, why are other people's cloned voices better than mine?

                Make sure these things are **NOT** in your voice input (in no particular order):

                * Noise (you can use a noise remover beforehand)
                * Music (there are also music-removal tools, unless you want music in the background)
                * A cut-off at the end (this will cause the model to try to continue the audio)
                * Under 1 second of training data (I personally suggest around 10 seconds for good potential, but I've had great results with 5 seconds as well)

                What makes for good prompt audio (in no particular order):

                * Clearly spoken
                * No weird background noises
                * Only one speaker
                * Audio which ends after a sentence ends
                * A regular/common voice (these usually have more success; it can still clone complex voices, just not as well)
                * Around 10 seconds of data
                """
            )
        with gr.Column():
            tokenizer_dropdown = gr.Dropdown(
                label="Tokenizer",
                choices=[
                    "quantifier_hubert_base_ls960.pth @ GitMylo/bark-voice-cloning",
                    "quantifier_hubert_base_ls960_14.pth @ GitMylo/bark-voice-cloning",
                    "quantifier_V1_hubert_base_ls960_23.pth @ GitMylo/bark-voice-cloning",
                    "polish-HuBERT-quantizer_8_epoch.pth @ Hobis/bark-voice-cloning-polish-HuBERT-quantizer",
                    "german-HuBERT-quantizer_14_epoch.pth @ CountFloyd/bark-voice-cloning-german-HuBERT-quantizer",
                    "es_tokenizer.pth @ Lancer1408/bark-es-tokenizer",
                    "portuguese-HuBERT-quantizer_24_epoch.pth @ MadVoyager/bark-voice-cloning-portuguese-HuBERT-quantizer",
                    "turkish_model_epoch_14.pth @ egeadam/bark-voice-cloning-turkish-HuBERT-quantizer",
                    "japanese-HuBERT-quantizer_24_epoch.pth @ junwchina/bark-voice-cloning-japanese-HuBERT-quantizer",
                    "it_tokenizer.pth @ gpwr/bark-it-tokenizer",
                ],
                value="quantifier_hubert_base_ls960_14.pth @ GitMylo/bark-voice-cloning",
                allow_custom_value=True,
                interactive=True,
            )
            file_input = gr.Audio(
                label="Input Audio",
                type="filepath",
                sources=["upload"],
                interactive=True,
            )
            with gr.Row():
                use_gpu_checkbox = gr.Checkbox(label="Use GPU", value=True)
                clear_models_button = gr.Button(
                    "Clear models",
                    variant="secondary",
                )

                def clear_models():
                    global hubert_model
                    global tokenizer
                    hubert_model = None
                    tokenizer = None
                    torch.cuda.empty_cache()
                    return gr.Button(
                        value="Models cleared",
                    )

                clear_models_button.click(
                    fn=clear_models,
                    outputs=[clear_models_button],
                )
            generate_voice_button = gr.Button(value="Generate Voice", variant="primary")

            def load_tokenizer(tokenizer_and_repo: str, use_gpu: bool):
                tokenizer, repo = tokenizer_and_repo.split(" @ ")
                device = "cuda" if use_gpu else "cpu"
                _load_tokenizer(
                    model=tokenizer,
                    repo=repo,
                    force_reload=True,
                    device=device,
                )
                return tokenizer_and_repo

            tokenizer_dropdown.change(
                load_tokenizer,
                inputs=[tokenizer_dropdown, use_gpu_checkbox],
                outputs=[tokenizer_dropdown],
                api_name="bark_voice_tokenizer_load",
            )
            gr.Markdown("Generated voice:")
            voice_file_name = gr.Textbox(
                label="Voice file name", value="", interactive=False
            )
            audio_preview = gr.Audio(label="EnCodec audio preview")
            gr.Markdown("The 'Use as history' button is now only available in the React UI.")

            def generate_voice(wav_file: str, use_gpu: bool):
                full_generation = get_prompts(wav_file, use_gpu)
                filename = save_cloned_voice(full_generation)
                return filename, get_audio_from_full_generation(full_generation)

            generate_voice_button.click(
                fn=generate_voice,
                inputs=[file_input, use_gpu_checkbox],
                outputs=[voice_file_name, audio_preview],
                preprocess=True,
                api_name="bark_voice_generate",
            )
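

# A minimal sketch of programmatic use, without the UI (the .wav path is
# hypothetical):
#
#   full_generation = get_prompts("samples/speaker.wav", use_gpu=True)
#   npz_path = save_cloned_voice(full_generation)  # writes voices/<hash>.npz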


if __name__ == "__main__":
    with gr.Blocks() as demo:
        tab_voice_clone()
    demo.launch()