Source code for lhotse.recipes.timit

#!/usr/bin/env python3

# Copyright 2021 Xiaomi Corporation (Author: Mingshuang Luo)
# Apache 2.0

import glob
import logging
import zipfile
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm.auto import tqdm

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, resumable_download


[docs] def download_timit( target_dir: Pathlike = ".", force_download: bool = False, base_url: Optional[str] = "https://data.deepai.org/timit.zip", ) -> Path: """ Download and unzip the dataset TIMIT. :param target_dir: Pathlike, the path of the dir to store the dataset. :param force_download: bool, if True, download the zips no matter if the zips exists. :param base_url: str, the URL of the TIMIT dataset to download. :return: the path to downloaded and extracted directory with data. """ target_dir = Path(target_dir) target_dir.mkdir(parents=True, exist_ok=True) zip_name = "timit.zip" zip_path = target_dir / zip_name corpus_dir = zip_path.with_suffix("") completed_detector = corpus_dir / ".completed" if completed_detector.is_file(): logging.info(f"Skipping {zip_name} because {completed_detector} exists.") return corpus_dir resumable_download(base_url, filename=zip_path, force_download=force_download) with zipfile.ZipFile(zip_path) as zip_file: corpus_dir.mkdir(parents=True, exist_ok=True) for names in tqdm(zip_file.namelist(), "Extracting files"): zip_file.extract(names, str(corpus_dir)) zip_path.unlink() completed_detector.touch() return corpus_dir
[docs] def prepare_timit( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, num_phones: int = 48, num_jobs: int = 1, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consists of the Recodings and Supervisions. :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write and save the manifests. :param supervision_lvl: str='phone', the level of the supervision, 'phone', 'word' or 'text'. :param num_phones: int=48, the number of phones (60, 48 or 39) for modeling and 48 is regarded as the default value. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) manifests = defaultdict(dict) dataset_parts = ["TRAIN", "DEV", "TEST"] phones_dict = {} if num_phones in [60, 48, 39]: phones_dict = get_phonemes(num_phones) else: raise ValueError("The value of num_phones must be in [60, 48, 39].") dev_spks, test_spks = get_speakers() for part in dataset_parts: wav_files = [] if part == "TRAIN": print("starting....") wav_files = glob.glob(str(corpus_dir) + "/data/TRAIN/*/*/*.WAV") # filter the SA (dialect sentences) # wav_files = list(filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files)) elif part == "DEV": wav_files = glob.glob(str(corpus_dir) + "/data/TEST/*/*/*.WAV") # filter the SA (dialect sentences) # wav_files = list(filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files)) wav_files = list( filter(lambda x: x.split("/")[-2].lower() in dev_spks, wav_files) ) else: wav_files = glob.glob(str(corpus_dir) + "/data/TEST/*/*/*.WAV") # filter the SA (dialect sentences) # wav_files = list(filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files)) wav_files = list( filter(lambda x: x.split("/")[-2].lower() in test_spks, wav_files) ) logging.debug(f"{part} dataset manifest generation.") recordings = [] supervisions = [] if num_jobs <= 1: for wav_file in tqdm(wav_files, f"Preparing {part} manifest"): try: recording, supervision = prepare_recording( wav_file, num_phones, phones_dict ) recordings.append(recording) supervisions.append(supervision) except FileNotFoundError as e: logging.warning(e.strerror) else: with ProcessPoolExecutor(num_jobs) as ex: results = [] for wav_file in wav_files: results.append( ex.submit(prepare_recording, wav_file, num_phones, phones_dict) ) for r in tqdm(results, f"Preparing {part} manifest"): try: recording, supervision = r.result() recordings.append(recording) supervisions.append(supervision) except FileNotFoundError as e: logging.warning(e.strerror) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) recording_set, supervision_set = fix_manifests(recording_set, supervision_set) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file(output_dir / f"timit_supervisions_{part}.jsonl.gz") recording_set.to_file(output_dir / f"timit_recordings_{part}.jsonl.gz") manifests[part] = { "recordings": recording_set, "supervisions": supervision_set, } return manifests
def prepare_recording(wav_file, num_phones, phones_dict): items = str(wav_file).strip().split("/") idx = items[-2] + "-" + items[-1][:-4] speaker = items[-2] text_file = Path(wav_file).with_suffix(".TXT") word_file = Path(wav_file).with_suffix(".WRD") phone_file = Path(wav_file).with_suffix(".PHN") recording = Recording.from_file(path=wav_file, recording_id=idx) with open(text_file, "r") as f: text = " ".join(f.read().rstrip("\n").split(" ")[2:]) word_alignments = [] with open(word_file, "r") as f: lines = f.readlines() for line in lines: st, et, word = line.strip().split(" ") start = float(st) / recording.sampling_rate end = float(et) / recording.sampling_rate word_alignments.append(AlignmentItem(word, start, end - start)) phone_alignments = [] with open(phone_file, "r") as f: lines = f.readlines() for line in lines: st, et, phone = line.strip().split(" ") start = float(st) / recording.sampling_rate end = float(et) / recording.sampling_rate if num_phones != 60: phone = phones_dict[phone] phone_alignments.append(AlignmentItem(phone, start, end - start)) segment = SupervisionSegment( id=idx, recording_id=idx, start=0.0, duration=recording.duration, channel=0, language="English", speaker=speaker, gender="male" if speaker.lower().startswith("m") else "female", text=text.strip(), ) segment = segment.with_alignment("word", word_alignments).with_alignment( "phone", phone_alignments ) return recording, segment def get_phonemes(num_phones): """ Choose and convert the phones for modeling. :param num_phones: the number of phones for modeling. """ phonemes = {} if num_phones == int(48): logging.debug("Using 48 phones for modeling!") # This dictionary is used to convert the 60 phoneme set into the 48 one. phonemes["sil"] = "sil" phonemes["aa"] = "aa" phonemes["ae"] = "ae" phonemes["ah"] = "ah" phonemes["ao"] = "ao" phonemes["aw"] = "aw" phonemes["ax"] = "ax" phonemes["ax-h"] = "ax" phonemes["axr"] = "er" phonemes["ay"] = "ay" phonemes["b"] = "b" phonemes["bcl"] = "vcl" phonemes["ch"] = "ch" phonemes["d"] = "d" phonemes["dcl"] = "vcl" phonemes["dh"] = "dh" phonemes["dx"] = "dx" phonemes["eh"] = "eh" phonemes["el"] = "el" phonemes["em"] = "m" phonemes["en"] = "en" phonemes["eng"] = "ng" phonemes["epi"] = "epi" phonemes["er"] = "er" phonemes["ey"] = "ey" phonemes["f"] = "f" phonemes["g"] = "g" phonemes["gcl"] = "vcl" phonemes["h#"] = "sil" phonemes["hh"] = "hh" phonemes["hv"] = "hh" phonemes["ih"] = "ih" phonemes["ix"] = "ix" phonemes["iy"] = "iy" phonemes["jh"] = "jh" phonemes["k"] = "k" phonemes["kcl"] = "cl" phonemes["l"] = "l" phonemes["m"] = "m" phonemes["n"] = "n" phonemes["ng"] = "ng" phonemes["nx"] = "n" phonemes["ow"] = "ow" phonemes["oy"] = "oy" phonemes["p"] = "p" phonemes["pau"] = "sil" phonemes["pcl"] = "cl" phonemes["q"] = "" phonemes["r"] = "r" phonemes["s"] = "s" phonemes["sh"] = "sh" phonemes["t"] = "t" phonemes["tcl"] = "cl" phonemes["th"] = "th" phonemes["uh"] = "uh" phonemes["uw"] = "uw" phonemes["ux"] = "uw" phonemes["v"] = "v" phonemes["w"] = "w" phonemes["y"] = "y" phonemes["z"] = "z" phonemes["zh"] = "zh" elif num_phones == int(39): logging.debug("Using 39 phones for modeling!") # This dictionary is used to convert the 60 phoneme set into the 39 one. phonemes["sil"] = "sil" phonemes["aa"] = "aa" phonemes["ae"] = "ae" phonemes["ah"] = "ah" phonemes["ao"] = "aa" phonemes["aw"] = "aw" phonemes["ax"] = "ah" phonemes["ax-h"] = "ah" phonemes["axr"] = "er" phonemes["ay"] = "ay" phonemes["b"] = "b" phonemes["bcl"] = "sil" phonemes["ch"] = "ch" phonemes["d"] = "d" phonemes["dcl"] = "sil" phonemes["dh"] = "dh" phonemes["dx"] = "dx" phonemes["eh"] = "eh" phonemes["el"] = "l" phonemes["em"] = "m" phonemes["en"] = "n" phonemes["eng"] = "ng" phonemes["epi"] = "sil" phonemes["er"] = "er" phonemes["ey"] = "ey" phonemes["f"] = "f" phonemes["g"] = "g" phonemes["gcl"] = "sil" phonemes["h#"] = "sil" phonemes["hh"] = "hh" phonemes["hv"] = "hh" phonemes["ih"] = "ih" phonemes["ix"] = "ih" phonemes["iy"] = "iy" phonemes["jh"] = "jh" phonemes["k"] = "k" phonemes["kcl"] = "sil" phonemes["l"] = "l" phonemes["m"] = "m" phonemes["ng"] = "ng" phonemes["n"] = "n" phonemes["nx"] = "n" phonemes["ow"] = "ow" phonemes["oy"] = "oy" phonemes["p"] = "p" phonemes["pau"] = "sil" phonemes["pcl"] = "sil" phonemes["q"] = "" phonemes["r"] = "r" phonemes["s"] = "s" phonemes["sh"] = "sh" phonemes["t"] = "t" phonemes["tcl"] = "sil" phonemes["th"] = "th" phonemes["uh"] = "uh" phonemes["uw"] = "uw" phonemes["ux"] = "uw" phonemes["v"] = "v" phonemes["w"] = "w" phonemes["y"] = "y" phonemes["z"] = "z" phonemes["zh"] = "sh" else: logging.debug("Using 60 phones for modeling!") return phonemes def get_speakers(): # List of test speakers test_spk = [ "fdhc0", "felc0", "fjlm0", "fmgd0", "fmld0", "fnlp0", "fpas0", "fpkt0", "mbpm0", "mcmj0", "mdab0", "mgrt0", "mjdh0", "mjln0", "mjmp0", "mklt0", "mlll0", "mlnt0", "mnjm0", "mpam0", "mtas1", "mtls0", "mwbt0", "mwew0", ] # List of dev speakers dev_spk = [ "fadg0", "faks0", "fcal1", "fcmh0", "fdac1", "fdms0", "fdrw0", "fedw0", "fgjd0", "fjem0", "fjmg0", "fjsj0", "fkms0", "fmah0", "fmml0", "fnmr0", "frew0", "fsem0", "majc0", "mbdg0", "mbns0", "mbwm0", "mcsh0", "mdlf0", "mdls0", "mdvc0", "mers0", "mgjf0", "mglb0", "mgwt0", "mjar0", "mjfc0", "mjsw0", "mmdb1", "mmdm2", "mmjr0", "mmwh0", "mpdf0", "mrcs0", "mreb0", "mrjm4", "mrjr0", "mroa0", "mrtk0", "mrws1", "mtaa0", "mtdt0", "mteb0", "mthc0", "mwjg0", ] return dev_spk, test_spk