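- """Qt UI for the SV2TTS toolbox: dataset browsing, recording, embedding visualization and synthesis/vocoding controls."""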
- import sys
- from pathlib import Path
- from time import sleep
- from typing import List, Set
- from warnings import filterwarnings, warn
- import matplotlib.pyplot as plt
- import numpy as np
- import sounddevice as sd
- import soundfile as sf
- import umap
- from PyQt5.QtCore import Qt, QStringListModel
- from PyQt5.QtWidgets import *
- from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
- from encoder.inference import plot_embedding_as_heatmap
- from toolbox.utterance import Utterance
- filterwarnings("ignore")
- colormap = np.array([
- [0, 127, 70],
- [255, 0, 0],
- [255, 217, 38],
- [0, 135, 255],
- [165, 0, 165],
- [255, 167, 255],
- [97, 142, 151],
- [0, 255, 255],
- [255, 96, 38],
- [142, 76, 0],
- [33, 0, 127],
- [0, 0, 0],
- [183, 183, 183],
- [76, 255, 0],
- ], dtype=float) / 255  # np.float was removed in NumPy 1.24; the builtin float is equivalent here
- default_text = \
- "Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \
- "yourself.\nOnce its embedding has been created, you can synthesize any text written here.\n" \
- "The synthesizer expects to generate " \
- "outputs that are somewhere between 5 and 12 seconds.\nTo mark breaks, write a new line. " \
- "Each line will be treated separately.\nThen, they are joined together to make the final " \
- "spectrogram. Use the vocoder to generate audio.\nThe vocoder generates almost in constant " \
- "time, so it will be more time efficient for longer inputs like this one.\nOn the left you " \
- "have the embedding projections. Load or record more utterances to see them.\nIf you have " \
- "at least 2 or 3 utterances from a same speaker, a cluster should form.\nSynthesized " \
- "utterances are of the same color as the speaker whose voice was used, but they're " \
- "represented with a cross."
- class UI(QDialog):
- min_umap_points = 4
- max_log_lines = 5
- max_saved_utterances = 20
- def draw_utterance(self, utterance: Utterance, which):
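- """Draws both the spectrogram and the embedding of an utterance on the "current" or "generated" plots."""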
- self.draw_spec(utterance.spec, which)
- self.draw_embed(utterance.embed, utterance.name, which)
- def draw_embed(self, embed, name, which):
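- """Plots the embedding as a heatmap on the selected figure, or clears it when embed is None."""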
- embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
- embed_ax.figure.suptitle("" if embed is None else name)
- ## Embedding
- # Clear the plot
- if len(embed_ax.images) > 0:
- embed_ax.images[0].colorbar.remove()
- embed_ax.clear()
- # Draw the embed
- if embed is not None:
- plot_embedding_as_heatmap(embed, embed_ax)
- embed_ax.set_title("embedding")
- embed_ax.set_aspect("equal", "datalim")
- embed_ax.set_xticks([])
- embed_ax.set_yticks([])
- embed_ax.figure.canvas.draw()
- def draw_spec(self, spec, which):
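- """Plots the mel spectrogram on the selected figure, or clears it when spec is None."""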
- _, spec_ax = self.current_ax if which == "current" else self.gen_ax
- ## Spectrogram
- # Draw the spectrogram
- spec_ax.clear()
- if spec is not None:
- spec_ax.imshow(spec, aspect="auto", interpolation="none")
- spec_ax.set_title("mel spectrogram")
- spec_ax.set_xticks([])
- spec_ax.set_yticks([])
- spec_ax.figure.canvas.draw()
- if which != "current":
- self.vocode_button.setDisabled(spec is None)
- def draw_umap_projections(self, utterances: Set[Utterance]):
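- """Projects all utterance embeddings to 2D with UMAP and plots them, one color per speaker."""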
- self.umap_ax.clear()
- speakers = np.unique([u.speaker_name for u in utterances])
- colors = {speaker_name: colormap[i] for i, speaker_name in enumerate(speakers)}
- embeds = [u.embed for u in utterances]
- # Display a message if there aren't enough points
- if len(utterances) < self.min_umap_points:
- self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" %
- (self.min_umap_points - len(utterances)),
- horizontalalignment='center', fontsize=15)
- self.umap_ax.set_title("")
- # Compute the projections
- else:
- if not self.umap_hot:
- self.log(
- "Drawing UMAP projections for the first time, this will take a few seconds.")
- self.umap_hot = True
- reducer = umap.UMAP(n_neighbors=int(np.ceil(np.sqrt(len(embeds)))), metric="cosine")
- projections = reducer.fit_transform(embeds)
- speakers_done = set()
- for projection, utterance in zip(projections, utterances):
- color = colors[utterance.speaker_name]
- mark = "x" if "_gen_" in utterance.name else "o"
- label = None if utterance.speaker_name in speakers_done else utterance.speaker_name
- speakers_done.add(utterance.speaker_name)
- self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark,
- label=label)
- self.umap_ax.legend(prop={'size': 10})
- # Draw the plot
- self.umap_ax.set_aspect("equal", "datalim")
- self.umap_ax.set_xticks([])
- self.umap_ax.set_yticks([])
- self.umap_ax.figure.canvas.draw()
- def save_audio_file(self, wav, sample_rate):
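- """Asks for a destination path and writes the waveform to it (defaults to .wav)."""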
- dialog = QFileDialog()
- dialog.setDefaultSuffix(".wav")
- fpath, _ = dialog.getSaveFileName(
- parent=self,
- caption="Select a path to save the audio file",
- filter="Audio Files (*.flac *.wav)"
- )
- if fpath:
- # Default to .wav when no extension is given
- if Path(fpath).suffix == "":
- fpath += ".wav"
- sf.write(fpath, wav, sample_rate)
- def setup_audio_devices(self, sample_rate):
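- """Probes the available audio devices for the given sample rate and fills the output device box."""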
- input_devices = []
- output_devices = []
- for device in sd.query_devices():
- # Check if valid input
- try:
- sd.check_input_settings(device=device["name"], samplerate=sample_rate)
- input_devices.append(device["name"])
- except Exception:
- pass
- # Check if valid output
- try:
- sd.check_output_settings(device=device["name"], samplerate=sample_rate)
- output_devices.append(device["name"])
- except Exception as e:
- # Log a warning only if the device is not an input
- if not device["name"] in input_devices:
- warn("Unsupported output device %s for the sample rate: %d \nError: %s" % (device["name"], sample_rate, str(e)))
- if len(input_devices) == 0:
- self.log("No audio input device detected. Recording may not work.")
- self.audio_in_device = None
- else:
- self.audio_in_device = input_devices[0]
- if len(output_devices) == 0:
- self.log("No supported output audio devices were found! Audio output may not work.")
- self.audio_out_devices_cb.addItems(["None"])
- self.audio_out_devices_cb.setDisabled(True)
- else:
- self.audio_out_devices_cb.clear()
- self.audio_out_devices_cb.addItems(output_devices)
- self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device)
- self.set_audio_device()
- def set_audio_device(self):
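- """Applies the selected output device (and the detected input device) as sounddevice's defaults."""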
- output_device = self.audio_out_devices_cb.currentText()
- if output_device == "None":
- output_device = None
- # A None entry makes sounddevice fall back to PortAudio's default device
- sd.default.device = (self.audio_in_device, output_device)
- def play(self, wav, sample_rate):
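- """Plays a waveform at the given sample rate on the selected output device."""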
- try:
- sd.stop()
- sd.play(wav, sample_rate)
- except Exception as e:
- print(e)
- self.log("Error in audio playback. Try selecting a different audio output device.")
- self.log("Your device must be connected before you start the toolbox.")
- def stop(self):
- sd.stop()
- def record_one(self, sample_rate, duration):
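- """Records <duration> seconds of mono audio and returns it as a 1D array, or None on failure."""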
- self.record_button.setText("Recording...")
- self.record_button.setDisabled(True)
- self.log("Recording %d seconds of audio" % duration)
- sd.stop()
- try:
- wav = sd.rec(duration * sample_rate, sample_rate, 1)
- except Exception as e:
- print(e)
- self.log("Could not record anything. Is your recording device enabled?")
- self.log("Your device must be connected before you start the toolbox.")
- return None
- for i in np.arange(0, duration, 0.1):
- self.set_loading(i, duration)
- sleep(0.1)
- self.set_loading(duration, duration)
- sd.wait()
- self.log("Done recording.")
- self.record_button.setText("Record")
- self.record_button.setDisabled(False)
- return wav.squeeze()
- @property
- def current_dataset_name(self):
- return self.dataset_box.currentText()
- @property
- def current_speaker_name(self):
- return self.speaker_box.currentText()
- @property
- def current_utterance_name(self):
- return self.utterance_box.currentText()
- def browse_file(self):
- fpath = QFileDialog().getOpenFileName(
- parent=self,
- caption="Select an audio file",
- filter="Audio Files (*.mp3 *.flac *.wav *.m4a)"
- )
- return Path(fpath[0]) if fpath[0] != "" else ""
- @staticmethod
- def repopulate_box(box, items, random=False):
- """
- Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join
- data to the items
- """
- box.blockSignals(True)
- box.clear()
- for item in items:
- item = list(item) if isinstance(item, tuple) else [item]
- box.addItem(str(item[0]), *item[1:])
- if len(items) > 0:
- box.setCurrentIndex(np.random.randint(len(items)) if random else 0)
- box.setDisabled(len(items) == 0)
- box.blockSignals(False)
- def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int,
- random=True):
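- """Fills the dataset, speaker and utterance boxes from <datasets_root>, starting at the given level."""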
- # Select a random dataset
- if level <= 0:
- if datasets_root is not None:
- datasets = [datasets_root.joinpath(d) for d in recognized_datasets]
- datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
- self.browser_load_button.setDisabled(len(datasets) == 0)
- if datasets_root is None or len(datasets) == 0:
- msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \
- if datasets_root is None else "o not have any of the recognized datasets" \
- " in %s" % datasets_root)
- self.log(msg)
- msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
- "can still use the toolbox by recording samples yourself." % \
- ("\n\t".join(recognized_datasets))
- print(msg, file=sys.stderr)
- self.random_utterance_button.setDisabled(True)
- self.random_speaker_button.setDisabled(True)
- self.random_dataset_button.setDisabled(True)
- self.utterance_box.setDisabled(True)
- self.speaker_box.setDisabled(True)
- self.dataset_box.setDisabled(True)
- self.browser_load_button.setDisabled(True)
- self.auto_next_checkbox.setDisabled(True)
- return
- self.repopulate_box(self.dataset_box, datasets, random)
- # Select a random speaker
- if level <= 1:
- speakers_root = datasets_root.joinpath(self.current_dataset_name)
- speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()]
- self.repopulate_box(self.speaker_box, speaker_names, random)
- # Select a random utterance
- if level <= 2:
- utterances_root = datasets_root.joinpath(
- self.current_dataset_name,
- self.current_speaker_name
- )
- utterances = []
- for extension in ['mp3', 'flac', 'wav', 'm4a']:
- utterances.extend(Path(utterances_root).glob("**/*.%s" % extension))
- utterances = [fpath.relative_to(utterances_root) for fpath in utterances]
- self.repopulate_box(self.utterance_box, utterances, random)
- def browser_select_next(self):
- index = (self.utterance_box.currentIndex() + 1) % len(self.utterance_box)
- self.utterance_box.setCurrentIndex(index)
- @property
- def current_encoder_fpath(self):
- return self.encoder_box.itemData(self.encoder_box.currentIndex())
- @property
- def current_synthesizer_fpath(self):
- return self.synthesizer_box.itemData(self.synthesizer_box.currentIndex())
- @property
- def current_vocoder_fpath(self):
- return self.vocoder_box.itemData(self.vocoder_box.currentIndex())
- def populate_models(self, models_dir: Path):
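- """Scans <models_dir> for encoder, synthesizer and vocoder checkpoints and fills the model boxes; Griffin-Lim is always offered as a fallback vocoder."""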
- # Encoder
- encoder_fpaths = list(models_dir.glob("*/encoder.pt"))
- if len(encoder_fpaths) == 0:
- raise Exception("No encoder models found in %s" % models_dir)
- self.repopulate_box(self.encoder_box, [(f.parent.name, f) for f in encoder_fpaths])
- # Synthesizer
- synthesizer_fpaths = list(models_dir.glob("*/synthesizer.pt"))
- if len(synthesizer_fpaths) == 0:
- raise Exception("No synthesizer models found in %s" % models_dir)
- self.repopulate_box(self.synthesizer_box, [(f.parent.name, f) for f in synthesizer_fpaths])
- # Vocoder
- vocoder_fpaths = list(models_dir.glob("*/vocoder.pt"))
- vocoder_items = [(f.parent.name, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)]
- self.repopulate_box(self.vocoder_box, vocoder_items)
- @property
- def selected_utterance(self):
- return self.utterance_history.itemData(self.utterance_history.currentIndex())
- def register_utterance(self, utterance: Utterance):
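- """Inserts an utterance at the top of the history box and enables the playback and generation buttons."""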
- self.utterance_history.blockSignals(True)
- self.utterance_history.insertItem(0, utterance.name, utterance)
- self.utterance_history.setCurrentIndex(0)
- self.utterance_history.blockSignals(False)
- if len(self.utterance_history) > self.max_saved_utterances:
- self.utterance_history.removeItem(self.max_saved_utterances)
- self.play_button.setDisabled(False)
- self.generate_button.setDisabled(False)
- self.synthesize_button.setDisabled(False)
- def log(self, line, mode="newline"):
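- """Writes a line to the log window; mode is one of newline, append or overwrite."""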
- if mode == "newline":
- self.logs.append(line)
- if len(self.logs) > self.max_log_lines:
- del self.logs[0]
- elif mode == "append":
- self.logs[-1] += line
- elif mode == "overwrite":
- self.logs[-1] = line
- log_text = '\n'.join(self.logs)
- self.log_window.setText(log_text)
- self.app.processEvents()
- def set_loading(self, value, maximum=1):
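- """Updates the progress bar; value and maximum are scaled by 100 so fractional progress displays correctly."""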
- self.loading_bar.setValue(int(value * 100))  # QProgressBar requires ints
- self.loading_bar.setMaximum(int(maximum * 100))
- self.loading_bar.setTextVisible(value != 0)
- self.app.processEvents()
- def populate_gen_options(self, seed, trim_silences):
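- """Initializes the seed and silence trimming controls from the given options."""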
- if seed is not None:
- self.random_seed_checkbox.setChecked(True)
- self.seed_textbox.setText(str(seed))
- self.seed_textbox.setEnabled(True)
- else:
- self.random_seed_checkbox.setChecked(False)
- self.seed_textbox.setText(str(0))
- self.seed_textbox.setEnabled(False)
- if not trim_silences:
- self.trim_silences_checkbox.setChecked(False)
- self.trim_silences_checkbox.setDisabled(True)
- def update_seed_textbox(self):
- if self.random_seed_checkbox.isChecked():
- self.seed_textbox.setEnabled(True)
- else:
- self.seed_textbox.setEnabled(False)
- def reset_interface(self):
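- """Clears the plots and the progress bar, disables the action buttons and blanks the log window."""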
- self.draw_embed(None, None, "current")
- self.draw_embed(None, None, "generated")
- self.draw_spec(None, "current")
- self.draw_spec(None, "generated")
- self.draw_umap_projections(set())
- self.set_loading(0)
- self.play_button.setDisabled(True)
- self.generate_button.setDisabled(True)
- self.synthesize_button.setDisabled(True)
- self.vocode_button.setDisabled(True)
- self.replay_wav_button.setDisabled(True)
- self.export_wav_button.setDisabled(True)
- [self.log("") for _ in range(self.max_log_lines)]
- def __init__(self):
- ## Initialize the application
- self.app = QApplication(sys.argv)
- super().__init__(None)
- self.setWindowTitle("SV2TTS toolbox")
- ## Main layouts
- # Root
- root_layout = QGridLayout()
- self.setLayout(root_layout)
- # Browser
- browser_layout = QGridLayout()
- root_layout.addLayout(browser_layout, 0, 0, 1, 2)
- # Generation
- gen_layout = QVBoxLayout()
- root_layout.addLayout(gen_layout, 0, 2, 1, 2)
- # Projections
- self.projections_layout = QVBoxLayout()
- root_layout.addLayout(self.projections_layout, 1, 0, 1, 1)
- # Visualizations
- vis_layout = QVBoxLayout()
- root_layout.addLayout(vis_layout, 1, 1, 1, 3)
- ## Projections
- # UMap
- fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0")
- fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98)
- self.projections_layout.addWidget(FigureCanvas(fig))
- self.umap_hot = False
- self.clear_button = QPushButton("Clear")
- self.projections_layout.addWidget(self.clear_button)
- ## Browser
- # Dataset, speaker and utterance selection
- i = 0
- self.dataset_box = QComboBox()
- browser_layout.addWidget(QLabel("<b>Dataset</b>"), i, 0)
- browser_layout.addWidget(self.dataset_box, i + 1, 0)
- self.speaker_box = QComboBox()
- browser_layout.addWidget(QLabel("<b>Speaker</b>"), i, 1)
- browser_layout.addWidget(self.speaker_box, i + 1, 1)
- self.utterance_box = QComboBox()
- browser_layout.addWidget(QLabel("<b>Utterance</b>"), i, 2)
- browser_layout.addWidget(self.utterance_box, i + 1, 2)
- self.browser_load_button = QPushButton("Load")
- browser_layout.addWidget(self.browser_load_button, i + 1, 3)
- i += 2
- # Random buttons
- self.random_dataset_button = QPushButton("Random")
- browser_layout.addWidget(self.random_dataset_button, i, 0)
- self.random_speaker_button = QPushButton("Random")
- browser_layout.addWidget(self.random_speaker_button, i, 1)
- self.random_utterance_button = QPushButton("Random")
- browser_layout.addWidget(self.random_utterance_button, i, 2)
- self.auto_next_checkbox = QCheckBox("Auto select next")
- self.auto_next_checkbox.setChecked(True)
- browser_layout.addWidget(self.auto_next_checkbox, i, 3)
- i += 1
- # Utterance box
- browser_layout.addWidget(QLabel("<b>Use embedding from:</b>"), i, 0)
- self.utterance_history = QComboBox()
- browser_layout.addWidget(self.utterance_history, i, 1, 1, 3)
- i += 1
- # Random & next utterance buttons
- self.browser_browse_button = QPushButton("Browse")
- browser_layout.addWidget(self.browser_browse_button, i, 0)
- self.record_button = QPushButton("Record")
- browser_layout.addWidget(self.record_button, i, 1)
- self.play_button = QPushButton("Play")
- browser_layout.addWidget(self.play_button, i, 2)
- self.stop_button = QPushButton("Stop")
- browser_layout.addWidget(self.stop_button, i, 3)
- i += 1
- # Model and audio output selection
- self.encoder_box = QComboBox()
- browser_layout.addWidget(QLabel("<b>Encoder</b>"), i, 0)
- browser_layout.addWidget(self.encoder_box, i + 1, 0)
- self.synthesizer_box = QComboBox()
- browser_layout.addWidget(QLabel("<b>Synthesizer</b>"), i, 1)
- browser_layout.addWidget(self.synthesizer_box, i + 1, 1)
- self.vocoder_box = QComboBox()
- browser_layout.addWidget(QLabel("<b>Vocoder</b>"), i, 2)
- browser_layout.addWidget(self.vocoder_box, i + 1, 2)
- self.audio_out_devices_cb = QComboBox()
- browser_layout.addWidget(QLabel("<b>Audio Output</b>"), i, 3)
- browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 3)
- i += 2
- # Replay & save audio
- browser_layout.addWidget(QLabel("<b>Toolbox Output:</b>"), i, 0)
- self.waves_cb = QComboBox()
- self.waves_cb_model = QStringListModel()
- self.waves_cb.setModel(self.waves_cb_model)
- self.waves_cb.setToolTip("Select one of the last generated waveforms in this session for replaying or exporting")
- browser_layout.addWidget(self.waves_cb, i, 1)
- self.replay_wav_button = QPushButton("Replay")
- self.replay_wav_button.setToolTip("Replay the last generated vocoder output")
- browser_layout.addWidget(self.replay_wav_button, i, 2)
- self.export_wav_button = QPushButton("Export")
- self.export_wav_button.setToolTip("Save the last generated vocoder output to disk as a wav file")
- browser_layout.addWidget(self.export_wav_button, i, 3)
- i += 1
- ## Embed & spectrograms
- vis_layout.addStretch()
- gridspec_kw = {"width_ratios": [1, 4]}
- fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
- gridspec_kw=gridspec_kw)
- fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
- vis_layout.addWidget(FigureCanvas(fig))
- fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
- gridspec_kw=gridspec_kw)
- fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
- vis_layout.addWidget(FigureCanvas(fig))
- for ax in self.current_ax.tolist() + self.gen_ax.tolist():
- ax.set_facecolor("#F0F0F0")
- for side in ["top", "right", "bottom", "left"]:
- ax.spines[side].set_visible(False)
- ## Generation
- self.text_prompt = QPlainTextEdit(default_text)
- gen_layout.addWidget(self.text_prompt, stretch=1)
- self.generate_button = QPushButton("Synthesize and vocode")
- gen_layout.addWidget(self.generate_button)
- layout = QHBoxLayout()
- self.synthesize_button = QPushButton("Synthesize only")
- layout.addWidget(self.synthesize_button)
- self.vocode_button = QPushButton("Vocode only")
- layout.addWidget(self.vocode_button)
- gen_layout.addLayout(layout)
- layout_seed = QGridLayout()
- self.random_seed_checkbox = QCheckBox("Random seed:")
- self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
- layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
- self.seed_textbox = QLineEdit()
- self.seed_textbox.setMaximumWidth(80)
- layout_seed.addWidget(self.seed_textbox, 0, 1)
- self.trim_silences_checkbox = QCheckBox("Enhance vocoder output")
- self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
- " This feature requires `webrtcvad` to be installed.")
- layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
- gen_layout.addLayout(layout_seed)
- self.loading_bar = QProgressBar()
- gen_layout.addWidget(self.loading_bar)
- self.log_window = QLabel()
- self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft)
- gen_layout.addWidget(self.log_window)
- self.logs = []
- gen_layout.addStretch()
- ## Set the size of the window and of the elements
- max_size = QDesktopWidget().availableGeometry(self).size() * 0.8
- self.resize(max_size)
- ## Finalize the display
- self.reset_interface()
- self.show()
- def start(self):
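- """Runs the Qt event loop until the window is closed."""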
- self.app.exec_()