from datetime import datetime
from functools import partial
from multiprocessing import Pool
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
from tqdm import tqdm

from encoder import audio
from encoder.config import librispeech_datasets, anglophone_nationalites
from encoder.params_data import *   # provides partials_n_frames and sampling_rate used below

_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")

class DatasetLog:
    """
    Registers metadata about the dataset in a text file.
    """
    def __init__(self, root, name):
        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
        self.sample_data = dict()
        start_time = datetime.now().strftime("%A %d %B %Y at %H:%M")
        self.write_line("Creating dataset %s on %s" % (name, start_time))
        self.write_line("-----")
        self._log_params()

    def _log_params(self):
        from encoder import params_data
        self.write_line("Parameter values:")
        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
            value = getattr(params_data, param_name)
            self.write_line("\t%s: %s" % (param_name, value))
        self.write_line("-----")

    def write_line(self, line):
        self.text_file.write("%s\n" % line)

    def add_sample(self, **kwargs):
        for param_name, value in kwargs.items():
            if param_name not in self.sample_data:
                self.sample_data[param_name] = []
            self.sample_data[param_name].append(value)

    def finalize(self):
        self.write_line("Statistics:")
        for param_name, values in self.sample_data.items():
            self.write_line("\t%s:" % param_name)
            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
        self.write_line("-----")
        end_time = datetime.now().strftime("%A %d %B %Y at %H:%M")
        self.write_line("Finished on %s" % end_time)
        self.text_file.close()

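# Hypothetical usage sketch (illustrative names and values, not part of the
# pipeline): a DatasetLog accumulates per-sample values via add_sample() and
# writes min/max/mean/median for each tracked field when finalize() is called.
#
#   log = DatasetLog("/tmp/out", "LibriSpeech/train-other-500")
#   log.add_sample(duration=3.2)
#   log.add_sample(duration=5.7)
#   log.finalize()
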
def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> \
        Tuple[Optional[Path], Optional[DatasetLog]]:
    dataset_root = datasets_root.joinpath(dataset_name)
    if not dataset_root.exists():
        print("Couldn't find %s, skipping this dataset." % dataset_root)
        return None, None
    return dataset_root, DatasetLog(out_dir, dataset_name)

def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier; check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except OSError:
            existing_fnames = set()
    else:
        existing_fnames = set()

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    audio_durs = []
    for extension in _AUDIO_EXTENSIONS:
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
            audio_durs.append(len(wav) / sampling_rate)

    sources_file.close()
    return audio_durs

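# For illustration: each speaker gets one directory under out_dir holding a
# .npy mel spectrogram per utterance, plus a _sources.txt whose lines have the
# form "<out_fname>,<source_path>", e.g. (made-up names):
#   chapter1_utt1.npy,/datasets/LibriSpeech/train-other-500/84/chapter1/utt1.flac
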
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing,
                             logger):
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Process the utterances for each speaker over a pool of worker processes.
    # imap preserves the input order and yields results lazily, which lets tqdm
    # report progress as each speaker completes.
    work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir,
                      skip_existing=skip_existing)
    with Pool(4) as pool:
        tasks = pool.imap(work_fn, speaker_dirs)
        for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"):
            for sample_dur in sample_durs:
                logger.add_sample(duration=sample_dur)

    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)

def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    for dataset_name in librispeech_datasets["train"]["other"]:
        # Initialize the preprocessing
        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
        if not dataset_root:
            # Skip just this dataset and move on to the next one, matching the
            # "skipping this dataset" message printed above.
            continue

        # Preprocess all speakers
        speaker_dirs = list(dataset_root.glob("*"))
        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                                 skip_existing, logger)

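# Note: librispeech_datasets is defined in encoder/config.py; the
# ["train"]["other"] entry is assumed to list folder names such as
# "LibriSpeech/train-other-500" relative to datasets_root (the standard
# LibriSpeech directory layout).
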
def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
    # Initialize the preprocessing
    dataset_name = "VoxCeleb1"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Get the contents of the meta file
    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
        metadata = [line.split("\t") for line in metafile][1:]

    # Select the ID and the nationality, filter out non-anglophone speakers
    nationalities = {line[0]: line[3] for line in metadata}
    keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
                        nationality.lower() in anglophone_nationalites]
    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
          (len(keep_speaker_ids), len(nationalities)))

    # Get the speaker directories for anglophone speakers only
    speaker_dirs = dataset_root.joinpath("wav").glob("*")
    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
                    speaker_dir.name in keep_speaker_ids]
    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))

    # Preprocess all speakers
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                             skip_existing, logger)

def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
    # Initialize the preprocessing
    dataset_name = "VoxCeleb2"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Get the speaker directories and preprocess all speakers
    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                             skip_existing, logger)

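# Minimal usage sketch (hypothetical paths; assumes datasets_root contains the
# LibriSpeech/, VoxCeleb1/ and VoxCeleb2/ folders in their standard layouts):
#
#   from pathlib import Path
#   datasets_root = Path("~/datasets").expanduser()
#   out_dir = datasets_root.joinpath("SV2TTS", "encoder")
#   out_dir.mkdir(parents=True, exist_ok=True)
#   preprocess_librispeech(datasets_root, out_dir, skip_existing=True)
#   preprocess_voxceleb1(datasets_root, out_dir, skip_existing=True)
#   preprocess_voxceleb2(datasets_root, out_dir, skip_existing=True)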
|