preprocess.py

from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
                       no_alignments: bool, datasets_name: str, subfolders: str):
    # Gather the input directories
    dataset_root = datasets_root.joinpath(datasets_name)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)

    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        mel_frames = sum([int(m[4]) for m in metadata])
        timesteps = sum([int(m[3]) for m in metadata])
        sample_rate = hparams.sample_rate
        hours = (timesteps / sample_rate) / 3600
        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
              (len(metadata), mel_frames, timesteps, hours))
    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))

def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
        if no_alignments:
            # Gather the utterance audios and texts
            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
            extensions = ["*.wav", "*.flac", "*.mp3"]
            for extension in extensions:
                wav_fpaths = book_dir.glob(extension)

                for wav_fpath in wav_fpaths:
                    # Load the audio waveform (pass the sample rate as a keyword for newer librosa versions)
                    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
                    if hparams.rescale:
                        wav = wav / np.abs(wav).max() * hparams.rescaling_max

                    # Get the corresponding text
                    # Check for .txt (for compatibility with other datasets)
                    text_fpath = wav_fpath.with_suffix(".txt")
                    if not text_fpath.exists():
                        # Check for .normalized.txt (LibriTTS)
                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
                        assert text_fpath.exists()
                    with text_fpath.open("r") as text_file:
                        text = "".join([line for line in text_file])
                        text = text.replace("\"", "")
                        text = text.strip()

                    # Process the utterance
                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                                      skip_existing, hparams))
        else:
            # Process alignment file (LibriSpeech support)
            # Gather the utterance audios and texts
            try:
                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
                with alignments_fpath.open("r") as alignments_file:
                    alignments = [line.rstrip().split(" ") for line in alignments_file]
            except StopIteration:
                # A few alignment files will be missing
                continue

            # Iterate over each entry in the alignments file
            for wav_fname, words, end_times in alignments:
                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
                assert wav_fpath.exists()
                words = words.replace("\"", "").split(",")
                end_times = list(map(float, end_times.replace("\"", "").split(",")))

                # Process each sub-utterance
                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
                for i, (wav, text) in enumerate(zip(wavs, texts)):
                    sub_basename = "%s_%02d" % (wav_fname, i)
                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
                                                      skip_existing, hparams))

    return [m for m in metadata if m is not None]

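# Illustrative only: the "*.alignment.txt" files parsed above are expected to contain one
# space-separated line per utterance of the form
#   <wav_fname> "<comma-separated words>" "<comma-separated end times>"
# where the word list starts and ends with an empty entry marking leading/trailing silence.
# The values below are hypothetical, not taken from any real alignment file:
#
#   84-121550-0000 ",hello,world," "0.32,0.89,1.41,1.60"
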
def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform (pass the sample rate as a keyword for newer librosa versions)
    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    #     print("")

    return wavs, texts

def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text

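# Illustrative only: each tuple returned above is written by preprocess_dataset as one
# "|"-separated line of train.txt. Schematically (placeholders, not real values):
#
#   audio-<basename>.npy|mel-<basename>.npy|embed-<basename>.npy|<num samples>|<num mel frames>|<text>
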
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)

def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate processes
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
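
# Minimal usage sketch, kept commented out like the DEBUG block above. It assumes the surrounding
# repository provides an `hparams` object (e.g. `from synthesizer.hparams import hparams`) and a
# trained encoder checkpoint; every path and value below is a placeholder, not a verified default.
#
# if __name__ == "__main__":
#     from synthesizer.hparams import hparams          # assumed location of the hyperparameters
#     datasets_root = Path("<datasets_root>")              # placeholder
#     out_dir = Path("<datasets_root>/SV2TTS/synthesizer")  # placeholder
#     out_dir.mkdir(parents=True, exist_ok=True)
#     preprocess_dataset(datasets_root, out_dir, n_processes=4, skip_existing=True, hparams=hparams,
#                        no_alignments=False, datasets_name="LibriSpeech", subfolders="train-clean-100")
#     create_embeddings(out_dir, Path("<encoder_model_fpath>.pt"), n_processes=4)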