# audio.py

from scipy.ndimage import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct

try:
    import webrtcvad
except ImportError:
    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
    webrtcvad = None

# Maximum value of a signed 16-bit sample, used to scale floats to PCM
int16_max = (2 ** 15) - 1


def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None,
                   normalize: Optional[bool] = True,
                   trim_silence: Optional[bool] = True):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav) or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be detected automatically and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels,
    )
    return frames.astype(np.float32).T


def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params_data.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activity detection in the most aggressive mode (3)
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Each int16 sample spans 2 bytes, hence the doubled indices into the PCM buffer
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        # Pad so the output has the same length as the input, then use a
        # cumulative sum to compute the windowed mean in linear time
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions so that short pauses between speech are kept
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]


def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    # Compute the gain (in dB) needed to bring the RMS level to the target dBFS
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
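

# Minimal usage sketch of the pipeline above: "speech.wav" is a placeholder
# path to any audio file librosa can read; the sampling rate and mel settings
# come from the encoder.params_data hyperparameters imported at the top.
if __name__ == "__main__":
    wav = preprocess_wav("speech.wav")
    mel = wav_to_mel_spectrogram(wav)
    print("Preprocessed waveform: %d samples at %d Hz" % (len(wav), sampling_rate))
    print("Mel spectrogram shape (n_frames, n_mels):", mel.shape)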