synthesize.py

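"""
Generates ground truth-aligned (GTA) mel spectrograms from a trained Tacotron
synthesizer, for use as training data for the vocoder.
"""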
import platform
from functools import partial
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from synthesizer.hparams import hparams_debug_string
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import data_parallel_workaround
from synthesizer.utils.symbols import symbols


def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = out_dir / "mels_gta"
    synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for GTA mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using the same reduction factor the model was trained with
    r = np.int32(model.r)

    # Set the model to eval mode (disables dropout and zoneout)
    model.eval()

    # Initialize the dataset
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")
    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    data_loader = DataLoader(dataset,
                             batch_size=hparams.synthesis_batch_size,
                             collate_fn=collate_fn,
                             # DataLoader worker processes are unreliable on Windows
                             num_workers=2 if platform.system() != "Windows" else 0)

    # Generate GTA mels
    meta_out_fpath = out_dir / "synthesized.txt"
    with meta_out_fpath.open("w") as file:
        for i, (texts, mels, embeds, idx) in tqdm(enumerate(data_loader), total=len(data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Parallelize the model onto the GPUs, using a workaround for a Python bug
            if device.type == "cuda" and torch.cuda.device_count() > 1:
                _, mels_out, _, _ = data_parallel_workaround(model, texts, mels, embeds)
            else:
                _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                # Note: the output mel files and the target ones have the same names, just different folders
                mel_filename = synth_dir.joinpath(dataset.metadata[k][1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mel
                mel_out = mel_out[:int(dataset.metadata[k][4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file
                file.write("|".join(dataset.metadata[k]))
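

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal CLI
# wrapper around run_synthesis. It assumes synthesizer.hparams also exposes a
# default `hparams` object, as the hparams_debug_string import suggests; the
# argument names below are hypothetical, so adjust them to your checkout.
if __name__ == "__main__":
    import argparse

    from synthesizer.hparams import hparams  # assumed default hyperparameters

    parser = argparse.ArgumentParser(
        description="Generate ground truth-aligned (GTA) mels for vocoder training")
    parser.add_argument("in_dir", type=Path,
                        help="Directory containing train.txt, mels/ and embeds/")
    parser.add_argument("out_dir", type=Path,
                        help="Directory to write mels_gta/ and synthesized.txt to")
    parser.add_argument("syn_model_fpath", type=Path,
                        help="Path to the trained Tacotron checkpoint")
    args = parser.parse_args()

    run_synthesis(args.in_dir, args.out_dir, args.syn_model_fpath, hparams)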