# encoder_preprocess.py
import argparse
from pathlib import Path

from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2
from utils.argutils import print_args
  5. if __name__ == "__main__":
  6. class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
  7. pass
  8. parser = argparse.ArgumentParser(
  9. description="Preprocesses audio files from datasets, encodes them as mel spectrograms and "
  10. "writes them to the disk. This will allow you to train the encoder. The "
  11. "datasets required are at least one of VoxCeleb1, VoxCeleb2 and LibriSpeech. "
  12. "Ideally, you should have all three. You should extract them as they are "
  13. "after having downloaded them and put them in a same directory, e.g.:\n"
  14. "-[datasets_root]\n"
  15. " -LibriSpeech\n"
  16. " -train-other-500\n"
  17. " -VoxCeleb1\n"
  18. " -wav\n"
  19. " -vox1_meta.csv\n"
  20. " -VoxCeleb2\n"
  21. " -dev",
  22. formatter_class=MyFormatter
  23. )
  24. parser.add_argument("datasets_root", type=Path, help=\
  25. "Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.")
  26. parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
  27. "Path to the output directory that will contain the mel spectrograms. If left out, "
  28. "defaults to <datasets_root>/SV2TTS/encoder/")
  29. parser.add_argument("-d", "--datasets", type=str,
  30. default="librispeech_other,voxceleb1,voxceleb2", help=\
  31. "Comma-separated list of the name of the datasets you want to preprocess. Only the train "
  32. "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, "
  33. "voxceleb2.")
  34. parser.add_argument("-s", "--skip_existing", action="store_true", help=\
  35. "Whether to skip existing output files with the same name. Useful if this script was "
  36. "interrupted.")
  37. parser.add_argument("--no_trim", action="store_true", help=\
  38. "Preprocess audio without trimming silences (not recommended).")
  39. args = parser.parse_args()
  40. # Verify webrtcvad is available
  41. if not args.no_trim:
  42. try:
  43. import webrtcvad
  44. except:
  45. raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
  46. "noise removal and is recommended. Please install and try again. If installation fails, "
  47. "use --no_trim to disable this error message.")
  48. del args.no_trim
  49. # Process the arguments
  50. args.datasets = args.datasets.split(",")
  51. if not hasattr(args, "out_dir"):
  52. args.out_dir = args.datasets_root.joinpath("SV2TTS", "encoder")
  53. assert args.datasets_root.exists()
  54. args.out_dir.mkdir(exist_ok=True, parents=True)
  55. # Preprocess the datasets
  56. print_args(args, parser)
  57. preprocess_func = {
  58. "librispeech_other": preprocess_librispeech,
  59. "voxceleb1": preprocess_voxceleb1,
  60. "voxceleb2": preprocess_voxceleb2,
  61. }
  62. args = vars(args)
  63. for dataset in args.pop("datasets"):
  64. print("Preprocessing %s" % dataset)
  65. preprocess_func[dataset](**args)