|
- import json
- import gradio as gr
- import os
- import shutil
- from tts_webui.bark.history_to_hash import history_to_hash
- from tts_webui.history_tab.save_photo import save_photo
- from tts_webui.history_tab.edit_metadata_ui import edit_metadata_ui
- from tts_webui.bark.get_audio_from_npz import get_audio_from_full_generation
- from tts_webui.bark.npz_tools import load_npz, save_npz
- from tts_webui.history_tab.get_wav_files import get_npz_files_voices
- from tts_webui.history_tab.main import _get_filename, _get_row_index
- from tts_webui.history_tab.open_folder import open_folder
- from tts_webui.tortoise.gr_reload_button import gr_reload_button
def update_voices_tab():
    """Return a ``gr.List`` update holding the current voice file listing.

    The rows are whatever ``get_npz_files_voices()`` returns at call time, so
    wiring this function to an event refreshes the table it is bound to.
    """
    current_rows = get_npz_files_voices()
    return gr.List(value=current_rows)
def voices_tab(directory="voices"):
    """Build the Gradio tab for browsing and managing saved voice ``.npz`` files.

    The left column offers a collapsible image gallery, an "open folder"
    button, a reload button and a table of voice files. The right column shows
    the selected voice (audio preview, hash, file name, metadata, photo and
    downloadable files) together with crop / rename / delete actions.

    Args:
        directory: Folder holding the voice files. It is used for the tab
            label, the "open folder" button, the gallery image listing and for
            mapping a gallery selection back to its ``.npz`` file.

    NOTE(review): the table rows come from ``get_npz_files_voices()``, which is
    called without a directory argument -- confirm it reads the same folder
    when a non-default ``directory`` is passed.
    """

    def list_gallery_images():
        """Return the paths of all ``.png`` files found in ``directory``."""
        # A missing folder gives an empty gallery rather than an exception
        # bubbling up from the Refresh button.
        if not os.path.isdir(directory):
            return []
        return [
            f"{directory}/{name}"
            for name in os.listdir(directory)
            if name.endswith(".png")
        ]

    # The tab handle is named ``tab`` so it does not shadow this function.
    with gr.Tab(directory.capitalize()) as tab, gr.Row(equal_height=False):
        with gr.Column():
            with gr.Accordion("Gallery Selector (Click to Open)", open=False):
                history_list_as_gallery = gr.Gallery(
                    value=[],
                    columns=4,
                    object_fit="contain",
                    height="auto",
                )
                gr.Button(value="Refresh").click(
                    fn=lambda: gr.Gallery(value=list_gallery_images()),
                    outputs=[history_list_as_gallery],
                )
            with gr.Row():
                button_output = gr.Button(value=f"Open {directory} folder")
                reload_button = gr_reload_button()

            button_output.click(lambda: open_folder(directory))

            datatypes = ["date", "str", "str", "str", "str"]
            headers = [
                "Date and Time",
                directory.capitalize(),
                "When",
                "Hash",
                "Filename",
            ]
            voices_list = gr.Dataframe(
                value=get_npz_files_voices(),
                interactive=False,
                datatype=datatypes,
                col_count=len(datatypes),
                headers=headers,
                max_height=800,
                # elem_classes="file-list"
            )

        with gr.Column():
            audio = gr.Audio(visible=True, type="numpy", label="Fine prompt audio")
            voice_hash = gr.Textbox(label="Hash", value="", interactive=False)
            crop_voice_button = gr.Button(value="Crop voice")
            voice_file_name = gr.Textbox(
                label="Voice file name", value="", interactive=False
            )
            new_voice_file_name = gr.Textbox(label="New voice file name", value="")

            with gr.Row():
                rename_voice_button = gr.Button(value="Rename voice")
                delete_voice_button = gr.Button(value="Delete voice", variant="stop")

            # NOTE(review): placement of this note relative to the button row
            # above should be confirmed against the intended layout.
            gr.Markdown("""Use voice button is now only available in React UI""")

            metadata = gr.JSON(label="Metadata")
            metadata_input = edit_metadata_ui(voice_file_name, metadata)
            photo = gr.Image(label="Photo", type="pil", interactive=True)
            file_list = gr.Files(value=[], label="Files", interactive=False)

            photo.upload(
                fn=save_photo,
                inputs=[photo, voice_file_name],
                outputs=[photo],
            )

    def delete_voice(voice_file_name):
        """Remove the voice file from disk and refresh the table."""
        os.remove(voice_file_name)
        return {
            delete_voice_button: gr.Button(value="Deleted"),
            voices_list: update_voices_tab(),
        }

    def rename_voice(voice_file_name_in, new_voice_file_name):
        """Move the voice file (and its ``.png`` photo, if any) to a new path."""
        shutil.move(voice_file_name_in, new_voice_file_name)
        png_file = voice_file_name_in.replace(".npz", ".png")
        if os.path.exists(png_file):
            shutil.move(png_file, new_voice_file_name.replace(".npz", ".png"))
        return {
            rename_voice_button: gr.Button(value="Renamed"),
            voices_list: update_voices_tab(),
            voice_file_name: gr.Textbox(value=new_voice_file_name),
        }

    def crop_voice(voice_file_name, audio_in):
        """Save a cropped copy of the voice and select it in the UI.

        ``audio_in`` is the raw (un-preprocessed) audio payload; its
        ``crop_min`` / ``crop_max`` entries are percentages of the full length
        and default to 0 and 100. The same percentage window is applied to the
        semantic, coarse and fine prompts, and the result is written next to
        the original as ``*_cropped_{crop_min}_{crop_max}.npz``.

        Raises:
            gr.Error: If the crop window leaves no semantic tokens, or if the
                cropped coarse/semantic lengths no longer match the expected
                rate ratio.
        """
        # Imported lazily so bark is only required when cropping is used.
        from bark.generation import COARSE_RATE_HZ, SEMANTIC_RATE_HZ, N_COARSE_CODEBOOKS

        crop_min, crop_max = audio_in.get("crop_min", 0), audio_in.get("crop_max", 100)

        def crop_bounds(length):
            """Translate the percentage window into start/stop indices."""
            return length * crop_min // 100, length * crop_max // 100

        full_generation = load_npz(voice_file_name)

        semantic_prompt = full_generation["semantic_prompt"]
        start, stop = crop_bounds(len(semantic_prompt))
        semantic_prompt = semantic_prompt[start:stop]

        coarse_prompt = full_generation["coarse_prompt"]
        start, stop = crop_bounds(coarse_prompt.shape[-1])
        coarse_prompt = coarse_prompt[:, start:stop]

        fine_prompt = full_generation["fine_prompt"]
        start, stop = crop_bounds(fine_prompt.shape[-1])
        fine_prompt = fine_prompt[:, start:stop]

        # An empty window would otherwise surface as a ZeroDivisionError in
        # the ratio check below.
        if len(semantic_prompt) == 0:
            raise gr.Error("Crop range is empty; nothing to save.")

        semantic_to_coarse_ratio = (
            COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
        )
        actual_ratio = round(coarse_prompt.shape[-1] / len(semantic_prompt), 1)
        expected_ratio = round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if actual_ratio != expected_ratio:
            raise gr.Error(
                "Cropped prompts are inconsistent: coarse/semantic ratio "
                f"{actual_ratio} does not match expected {expected_ratio}."
            )

        voice_file_name_cropped = voice_file_name.replace(
            ".npz", f"_cropped_{crop_min}_{crop_max}.npz"
        )

        # NOTE(review): the hash is computed from the uncropped generation;
        # confirm whether it should describe the cropped prompts instead.
        new_hash = history_to_hash(full_generation)  # type: ignore
        new_meta = full_generation.get("metadata", {})
        new_meta["crop_min"] = crop_min
        new_meta["crop_max"] = crop_max
        new_meta["hash"] = new_hash

        save_npz(
            voice_file_name_cropped,
            {
                "semantic_prompt": semantic_prompt,
                "coarse_prompt": coarse_prompt,
                "fine_prompt": fine_prompt,
            },
            metadata=new_meta,
        )
        return select_filename(voice_file_name_cropped)

    rename_voice_button.click(
        fn=rename_voice,
        inputs=[voice_file_name, new_voice_file_name],
        outputs=[rename_voice_button, voices_list, voice_file_name],
    )
    delete_voice_button.click(
        fn=delete_voice,
        inputs=[voice_file_name],
        outputs=[delete_voice_button, voices_list],
    )

    def get_file_list(filename_npz, resolved_photo):
        """Return the downloadable files: the ``.npz`` plus its photo, if any."""
        if resolved_photo is None:
            return [filename_npz]
        return [filename_npz, resolved_photo]

    def select_filename(filename_npz):
        """Load a voice file and return updates for every detail component."""
        full_generation = load_npz(filename_npz)
        metadata_value = full_generation.get("metadata", {})

        # The photo lives next to the voice file with a .png extension.
        resolved_photo = filename_npz.replace(".npz", ".png")
        if not os.path.exists(resolved_photo):
            resolved_photo = None

        return {
            voice_file_name: gr.Textbox(value=filename_npz),
            new_voice_file_name: gr.Textbox(value=filename_npz),
            # Reset the action buttons after an earlier "Deleted"/"Renamed".
            delete_voice_button: gr.Button(value="Delete"),
            rename_voice_button: gr.Button(value="Rename"),
            audio: gr.Audio(value=get_audio_from_full_generation(full_generation)),  # type: ignore
            metadata: gr.JSON(value=metadata_value),
            metadata_input: gr.Textbox(value=json.dumps(metadata_value, indent=2)),
            photo: gr.Image(value=resolved_photo),
            voice_hash: gr.Textbox(value=history_to_hash(full_generation)),  # type: ignore
            file_list: gr.Files(
                value=get_file_list(filename_npz, resolved_photo),
                label="Files",
            ),
        }

    def select(_list_data, evt: gr.SelectData):
        """Handle a table row selection by loading that row's voice file."""
        filename_npz = _get_filename(_list_data, _get_row_index(evt))
        return select_filename(filename_npz)

    def select_gallery(_list_data, evt: gr.SelectData):
        """Handle a gallery selection by loading the matching voice file."""
        selected_image = _list_data[evt.index]
        image_name = os.path.basename(selected_image["name"])
        filename_base = image_name.replace(".png", "")
        return select_filename(f"{directory}/{filename_base}.npz")

    # Components refreshed whenever a voice is (re)selected.
    outputs = [
        voice_file_name,
        new_voice_file_name,
        delete_voice_button,
        rename_voice_button,
        audio,
        metadata,
        metadata_input,
        photo,
        voice_hash,
        file_list,
    ]

    # preprocess=False hands crop_voice the raw audio payload, which it reads
    # the crop_min / crop_max entries from.
    crop_voice_button.click(
        fn=crop_voice,
        inputs=[voice_file_name, audio],
        outputs=outputs,
        preprocess=False,
    ).then(
        fn=update_voices_tab,
        outputs=[voices_list],
    )

    reload_button.click(fn=update_voices_tab, outputs=[voices_list])

    voices_list.select(
        fn=select, inputs=[voices_list], outputs=outputs, preprocess=False
    )

    history_list_as_gallery.select(
        fn=select_gallery, inputs=[history_list_as_gallery], outputs=outputs
    )

    # Refresh the table each time the tab is opened.
    tab.select(fn=update_voices_tab, outputs=[voices_list])
|