# github / tts-generation-webui
# mirror of https://github.com/rsxdalv/tts-generation-webui
							import json
import gradio as gr
import os
import shutil


from tts_webui.bark.history_to_hash import history_to_hash
from tts_webui.history_tab.save_photo import save_photo
from tts_webui.history_tab.edit_metadata_ui import edit_metadata_ui
from tts_webui.bark.get_audio_from_npz import get_audio_from_full_generation
from tts_webui.bark.npz_tools import load_npz, save_npz
from tts_webui.history_tab.get_wav_files import get_npz_files_voices
from tts_webui.history_tab.main import _get_filename, _get_row_index
from tts_webui.history_tab.open_folder import open_folder
from tts_webui.tortoise.gr_reload_button import gr_reload_button


def update_voices_tab():
    """Return a ``gr.List`` populated with the current voice file rows."""
    voice_rows = get_npz_files_voices()
    return gr.List(value=voice_rows)


def voices_tab(directory="voices"):
    """Build the Gradio tab for browsing and managing saved voice files.

    The left column holds a collapsible photo gallery, an "open folder"
    button, a reload button and a table of voices.  The right column shows
    the selected voice: audio preview, hash, crop/rename/delete actions,
    metadata (viewer and editor), photo and the list of backing files.

    Args:
        directory: Folder that holds the voice ``.npz`` files and their
            ``.png`` photos.  Its capitalized form is used as the tab label.
            NOTE(review): the table rows come from ``get_npz_files_voices()``,
            which receives no directory here - confirm it lists the same
            folder before passing a non-default value.
    """

    def list_gallery_images():
        """Return the ``.png`` paths inside ``directory`` (sorted by name)."""
        # A missing folder yields an empty gallery instead of an exception.
        if not os.path.isdir(directory):
            return []
        return [
            f"{directory}/{name}"
            for name in sorted(os.listdir(directory))
            if name.endswith(".png")
        ]

    # The tab handle is named ``tab`` so it does not shadow this function.
    with gr.Tab(directory.capitalize()) as tab, gr.Row(equal_height=False):
        with gr.Column():
            with gr.Accordion("Gallery Selector (Click to Open)", open=False):
                history_list_as_gallery = gr.Gallery(
                    value=[],
                    columns=4,
                    object_fit="contain",
                    height="auto",
                )
                gr.Button(value="Refresh").click(
                    fn=lambda: gr.Gallery(value=list_gallery_images()),
                    outputs=[history_list_as_gallery],
                )
            with gr.Row():
                button_output = gr.Button(value=f"Open {directory} folder")

                reload_button = gr_reload_button()
            button_output.click(lambda: open_folder(directory))

            datatypes = ["date", "str", "str", "str", "str"]
            headers = [
                "Date and Time",
                directory.capitalize(),
                "When",
                "Hash",
                "Filename",
            ]

            voices_list = gr.Dataframe(
                value=get_npz_files_voices(),
                interactive=False,
                datatype=datatypes,
                col_count=len(datatypes),
                headers=headers,
                max_height=800,
                #  elem_classes="file-list"
            )
        with gr.Column():
            audio = gr.Audio(visible=True, type="numpy", label="Fine prompt audio")
            voice_hash = gr.Textbox(label="Hash", value="", interactive=False)
            crop_voice_button = gr.Button(value="Crop voice")
            voice_file_name = gr.Textbox(
                label="Voice file name", value="", interactive=False
            )
            new_voice_file_name = gr.Textbox(label="New voice file name", value="")

            with gr.Row():
                rename_voice_button = gr.Button(value="Rename voice")
                delete_voice_button = gr.Button(value="Delete voice", variant="stop")
                gr.Markdown("""Use voice button is now only available in React UI""")

            metadata = gr.JSON(label="Metadata")
            metadata_input = edit_metadata_ui(voice_file_name, metadata)

            photo = gr.Image(label="Photo", type="pil", interactive=True)
            file_list = gr.Files(value=[], label="Files", interactive=False)

    photo.upload(
        fn=save_photo,
        inputs=[photo, voice_file_name],
        outputs=[photo],
    )

    def delete_voice(voice_file_name):
        """Delete the voice file and refresh the button label and table."""
        os.remove(voice_file_name)
        return {
            delete_voice_button: gr.Button(value="Deleted"),
            voices_list: update_voices_tab(),
        }

    def rename_voice(voice_file_name_in, new_voice_file_name):
        """Move the voice file (and its photo, if any) to a new path."""
        shutil.move(voice_file_name_in, new_voice_file_name)
        png_file = voice_file_name_in.replace(".npz", ".png")
        if os.path.exists(png_file):
            new_png_file = new_voice_file_name.replace(".npz", ".png")
            # When the new name contains no ".npz" the replacement is a no-op
            # and the photo would be moved on top of the voice file that was
            # just renamed, destroying it.  Append ".png" instead.
            if new_png_file == new_voice_file_name:
                new_png_file = f"{new_voice_file_name}.png"
            shutil.move(png_file, new_png_file)
        return {
            rename_voice_button: gr.Button(value="Renamed"),
            voices_list: update_voices_tab(),
            voice_file_name: gr.Textbox(value=new_voice_file_name),
        }

    def crop_voice(voice_file_name, audio_in):
        """Save a cropped copy of the voice and select it in the detail panel.

        Args:
            voice_file_name: Path of the ``.npz`` voice to crop.
            audio_in: Raw audio component payload (the click handler is
                registered with ``preprocess=False``).  ``crop_min`` and
                ``crop_max`` are read from it as percentages and default to
                0 and 100 when absent.

        Returns:
            The component updates produced by ``select_filename`` for the
            newly written ``*_cropped_<min>_<max>.npz`` file.

        Raises:
            gr.Error: If the crop range leaves no semantic tokens, or the
                cropped coarse/semantic lengths no longer match the expected
                rate ratio.
        """
        from bark.generation import COARSE_RATE_HZ, SEMANTIC_RATE_HZ, N_COARSE_CODEBOOKS

        # The payload may be missing (no audio loaded); fall back to defaults.
        crop_source = audio_in if isinstance(audio_in, dict) else {}
        crop_min = crop_source.get("crop_min", 0)
        crop_max = crop_source.get("crop_max", 100)

        def crop_bounds(length):
            # int() keeps the slice indices valid even if the percentages
            # arrive as floats.
            return int(length * crop_min // 100), int(length * crop_max // 100)

        full_generation = load_npz(voice_file_name)

        semantic_prompt = full_generation["semantic_prompt"]
        start, stop = crop_bounds(len(semantic_prompt))
        semantic_prompt = semantic_prompt[start:stop]

        coarse_prompt = full_generation["coarse_prompt"]
        start, stop = crop_bounds(coarse_prompt.shape[-1])
        coarse_prompt = coarse_prompt[:, start:stop]

        fine_prompt = full_generation["fine_prompt"]
        start, stop = crop_bounds(fine_prompt.shape[-1])
        fine_prompt = fine_prompt[:, start:stop]

        # Explicit checks instead of ``assert`` so they survive ``python -O``
        # and an empty crop reports a readable error rather than dividing by
        # zero.
        if len(semantic_prompt) == 0:
            raise gr.Error("Crop range is empty; nothing to save.")

        semantic_to_coarse_ratio = (
            COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
        )
        expected_ratio = round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
        actual_ratio = round(coarse_prompt.shape[-1] / len(semantic_prompt), 1)
        if actual_ratio != expected_ratio:
            raise gr.Error(
                "Cropped coarse/semantic prompt lengths are inconsistent "
                f"(ratio {actual_ratio}, expected {expected_ratio})."
            )

        voice_file_name_cropped = voice_file_name.replace(
            ".npz", f"_cropped_{crop_min}_{crop_max}.npz"
        )

        cropped_generation = {
            "semantic_prompt": semantic_prompt,
            "coarse_prompt": coarse_prompt,
            "fine_prompt": fine_prompt,
        }

        # Hash the cropped prompts so the stored hash describes the file that
        # is actually written, not the uncropped source.
        # NOTE(review): assumes history_to_hash only needs the three prompt
        # arrays - confirm against its implementation.
        new_hash = history_to_hash(cropped_generation)  # type: ignore
        # Copy so the loaded generation's metadata is not mutated in place.
        new_meta = dict(full_generation.get("metadata") or {})
        new_meta["crop_min"] = crop_min
        new_meta["crop_max"] = crop_max
        new_meta["hash"] = new_hash

        save_npz(
            voice_file_name_cropped,
            cropped_generation,
            metadata=new_meta,
        )

        return select_filename(voice_file_name_cropped)

    rename_voice_button.click(
        fn=rename_voice,
        inputs=[voice_file_name, new_voice_file_name],
        outputs=[rename_voice_button, voices_list, voice_file_name],
    )
    delete_voice_button.click(
        fn=delete_voice,
        inputs=[voice_file_name],
        outputs=[delete_voice_button, voices_list],
    )

    def select_filename(filename_npz):
        """Load a voice file and return updates for every detail component."""
        full_generation = load_npz(filename_npz)
        resolved_photo = filename_npz.replace(".npz", ".png")
        if not os.path.exists(resolved_photo):
            resolved_photo = None

        return {
            voice_file_name: gr.Textbox(value=filename_npz),
            new_voice_file_name: gr.Textbox(value=filename_npz),
            # Reset the action buttons' labels for the newly selected voice.
            delete_voice_button: gr.Button(value="Delete"),
            rename_voice_button: gr.Button(value="Rename"),
            audio: gr.Audio(value=get_audio_from_full_generation(full_generation)),  # type: ignore
            metadata: gr.JSON(value=full_generation.get("metadata", {})),
            metadata_input: gr.Textbox(
                value=json.dumps(full_generation.get("metadata", {}), indent=2)
            ),
            photo: gr.Image(value=resolved_photo),
            voice_hash: gr.Textbox(value=history_to_hash(full_generation)),  # type: ignore
            file_list: gr.Files(
                value=get_file_list(filename_npz, resolved_photo),
                label="Files",
            ),
        }

    def get_file_list(filename_npz, resolved_photo):
        """Return the voice file path plus its photo path when one exists."""
        if resolved_photo is None:
            return [filename_npz]

        return [filename_npz, resolved_photo]

    def select(_list_data, evt: gr.SelectData):
        """Handle a table selection by loading the voice of the clicked row."""
        filename_npz = _get_filename(_list_data, _get_row_index(evt))
        return select_filename(filename_npz)

    outputs = [
        voice_file_name,
        new_voice_file_name,
        delete_voice_button,
        rename_voice_button,
        audio,
        metadata,
        metadata_input,
        photo,
        voice_hash,
        file_list,
    ]

    crop_voice_button.click(
        fn=crop_voice,
        inputs=[voice_file_name, audio],
        outputs=outputs,
        preprocess=False,
    ).then(
        fn=update_voices_tab,
        outputs=[voices_list],
    )

    reload_button.click(fn=update_voices_tab, outputs=[voices_list])

    voices_list.select(
        fn=select, inputs=[voices_list], outputs=outputs, preprocess=False
    )

    def select_gallery(_list_data, evt: gr.SelectData):
        """Handle a gallery click by loading the voice matching the photo."""

        def get_gallery_item_path(item):
            # Gallery items are accepted in several shapes: a (path, caption)
            # pair, a dict with a "name" key (the shape the original code
            # expected), a dict with "path", or a dict nesting either under
            # "image".  NOTE(review): which shape arrives depends on the
            # installed Gradio version - confirm.
            if isinstance(item, (list, tuple)):
                item = item[0]
            if isinstance(item, dict):
                if "name" in item:
                    return item["name"]
                inner = item.get("image", item)
                if isinstance(inner, dict):
                    return inner.get("name") or inner["path"]
                return inner
            return item

        def get_gallery_file_selection(_gallery_data, evt: gr.SelectData):
            image_path = get_gallery_item_path(_gallery_data[evt.index])
            image_name = os.path.basename(image_path)
            return image_name.replace(".png", "")

        filename_base = get_gallery_file_selection(_list_data, evt)
        return select_filename(f"{directory}/{filename_base}.npz")

    history_list_as_gallery.select(
        fn=select_gallery, inputs=[history_list_as_gallery], outputs=outputs
    )

    # Refresh the table whenever the tab becomes active.
    tab.select(fn=update_voices_tab, outputs=[voices_list])