1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import platform
- from functools import partial
- from pathlib import Path
- import numpy as np
- import torch
- from torch.utils.data import DataLoader
- from tqdm import tqdm
- from synthesizer.hparams import hparams_debug_string
- from synthesizer.models.tacotron import Tacotron
- from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
- from synthesizer.utils import data_parallel_workaround
- from synthesizer.utils.symbols import symbols
def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    """Generate ground truth-aligned (GTA) mel spectrograms for vocoder training.

    A Tacotron model is built from ``hparams``, its weights are loaded from
    ``syn_model_fpath``, and it is run over every batch of the dataset found in
    ``in_dir`` with the ground-truth mels supplied as input. Each predicted mel
    is saved under ``out_dir / "mels_gta"`` using the same file name as its
    target mel, and the matching metadata row is appended to
    ``out_dir / "synthesized.txt"``.

    Args:
        in_dir: Directory containing ``train.txt``, ``mels/`` and ``embeds/``.
        out_dir: Directory that receives ``mels_gta/`` and ``synthesized.txt``.
        syn_model_fpath: Path handed to ``model.load`` to restore the weights.
        hparams: Hyperparameter object; this function reads
            ``synthesis_batch_size``, ``num_mels``, ``speaker_embedding_size``
            and the ``tts_*`` model dimensions from it.

    Raises:
        ValueError: If CUDA is available and ``hparams.synthesis_batch_size``
            is not a multiple of the number of visible GPUs.
    """
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = out_dir / "mels_gta"
    synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(
        embed_dims=hparams.tts_embed_dims,
        num_chars=len(symbols),
        encoder_dims=hparams.tts_encoder_dims,
        decoder_dims=hparams.tts_decoder_dims,
        n_mels=hparams.num_mels,
        fft_bins=hparams.num_mels,
        postnet_dims=hparams.tts_postnet_dims,
        encoder_K=hparams.tts_encoder_K,
        lstm_dims=hparams.tts_lstm_dims,
        postnet_K=hparams.tts_postnet_K,
        num_highways=hparams.tts_num_highways,
        dropout=0.,  # Use zero dropout for gta mels
        stop_threshold=hparams.tts_stop_threshold,
        speaker_embedding_size=hparams.speaker_embedding_size,
    ).to(device)

    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Switch layers to inference behaviour. Note that eval() does NOT turn off
    # autograd; gradient tracking is disabled separately with torch.no_grad()
    # around the synthesis loop below.
    model.eval()

    # Initialize the dataset
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    data_loader = DataLoader(dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)

    # The multi-GPU decision does not change between batches, so make it once.
    use_data_parallel = device.type == "cuda" and torch.cuda.device_count() > 1

    # Generate GTA mels
    meta_out_fpath = out_dir / "synthesized.txt"
    # no_grad: outputs are only written to disk, so building an autograd graph
    # for every batch would just waste memory and time.
    with meta_out_fpath.open("w") as file, torch.no_grad():
        for texts, mels, embeds, idx in tqdm(data_loader, total=len(data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if use_data_parallel:
                _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)
            else:
                _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                meta_row = dataset.metadata[k]

                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                mel_filename = synth_dir.joinpath(meta_row[1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(meta_row[4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file. No line terminator
                # is added here: the row is written exactly as stored in
                # dataset.metadata (presumably the last field keeps its
                # newline -- confirm against SynthesizerDataset).
                file.write("|".join(meta_row))
|