hparams.py 4.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. import ast
  2. import pprint
  3. class HParams(object):
  4. def __init__(self, **kwargs): self.__dict__.update(kwargs)
  5. def __setitem__(self, key, value): setattr(self, key, value)
  6. def __getitem__(self, key): return getattr(self, key)
  7. def __repr__(self): return pprint.pformat(self.__dict__)
  8. def parse(self, string):
  9. # Overrides hparams from a comma-separated string of name=value pairs
  10. if len(string) > 0:
  11. overrides = [s.split("=") for s in string.split(",")]
  12. keys, values = zip(*overrides)
  13. keys = list(map(str.strip, keys))
  14. values = list(map(str.strip, values))
  15. for k in keys:
  16. self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
  17. return self
  18. hparams = HParams(
  19. ### Signal Processing (used in both synthesizer and vocoder)
  20. sample_rate = 16000,
  21. n_fft = 800,
  22. num_mels = 80,
  23. hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
  24. win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
  25. fmin = 55,
  26. min_level_db = -100,
  27. ref_level_db = 20,
  28. max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small.
  29. preemphasis = 0.97, # Filter coefficient to use if preemphasize is True
  30. preemphasize = True,
  31. ### Tacotron Text-to-Speech (TTS)
  32. tts_embed_dims = 512, # Embedding dimension for the graphemes/phoneme inputs
  33. tts_encoder_dims = 256,
  34. tts_decoder_dims = 128,
  35. tts_postnet_dims = 512,
  36. tts_encoder_K = 5,
  37. tts_lstm_dims = 1024,
  38. tts_postnet_K = 5,
  39. tts_num_highways = 4,
  40. tts_dropout = 0.5,
  41. tts_cleaner_names = ["english_cleaners"],
  42. tts_stop_threshold = -3.4, # Value below which audio generation ends.
  43. # For example, for a range of [-4, 4], this
  44. # will terminate the sequence at the first
  45. # frame that has all values < -3.4
  46. ### Tacotron Training
  47. tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
  48. (2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
  49. (2, 2e-4, 80_000, 12), #
  50. (2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
  51. (2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
  52. (2, 1e-5, 640_000, 12)], # lr = learning rate
  53. tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed
  54. tts_eval_interval = 500, # Number of steps between model evaluation (sample generation)
  55. # Set to -1 to generate after completing epoch, or 0 to disable
  56. tts_eval_num_samples = 1, # Makes this number of samples
  57. ### Data Preprocessing
  58. max_mel_frames = 900,
  59. rescale = True,
  60. rescaling_max = 0.9,
  61. synthesis_batch_size = 16, # For vocoder preprocessing and inference.
  62. ### Mel Visualization and Griffin-Lim
  63. signal_normalization = True,
  64. power = 1.5,
  65. griffin_lim_iters = 60,
  66. ### Audio processing options
  67. fmax = 7600, # Should not exceed (sample_rate // 2)
  68. allow_clipping_in_normalization = True, # Used when signal_normalization = True
  69. clip_mels_length = True, # If true, discards samples exceeding max_mel_frames
  70. use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
  71. symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
  72. # and [0, max_abs_value] if False
  73. trim_silence = True, # Use with sample_rate of 16000 for best results
  74. ### SV2TTS
  75. speaker_embedding_size = 256, # Dimension for the speaker embedding
  76. silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
  77. utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
  78. )
  79. def hparams_debug_string():
  80. return str(hparams)