Source code for lhotse.recipes.heroico

import logging
import re
import tarfile
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, NamedTuple, Optional, Union

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import AudioSource, Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, resumable_download, safe_extract

# files containing transcripts
heroico_dataset_answers = "heroico-answers.txt"
heroico_dataset_recordings = "heroico-recordings.txt"
usma_dataset = "usma-prompts.txt"

folds = ("train", "devtest", "test")


def download_heroico(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = "http://www.openslr.org/resources/39",
) -> Path:
    """
    Download and untar the Heroico/USMA corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the archive even if it already exists.
    :param url: str, the base URL of the OpenSLR mirror hosting the archive.
    :return: the path to the directory with the extracted data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "LDC2006S37.tar.gz"
    tar_path = target_dir / tar_name
    completed_detector = target_dir / ".completed"
    if completed_detector.is_file():
        logging.info(f"Skipping {tar_name} because {completed_detector} exists.")
        return target_dir
    resumable_download(
        f"{url}/{tar_name}", filename=tar_path, force_download=force_download
    )
    with tarfile.open(tar_path) as tar:
        safe_extract(tar, path=target_dir)
    completed_detector.touch()
    return target_dir
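

# A minimal usage sketch for download_heroico, assuming the archive is mirrored at the
# default OpenSLR URL. The local "data/heroico" path is a placeholder, not a required
# layout; the function itself is idempotent thanks to the ".completed" marker file.
def _example_download() -> Path:
    corpus_root = download_heroico(target_dir="data/heroico", force_download=False)
    # corpus_root now contains the extracted LDC2006S37 directory tree.
    return corpus_root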


class HeroicoMetaData(NamedTuple):
    audio_path: Pathlike
    audio_info: Any
    text: str


class UttInfo(NamedTuple):
    fold: str
    speaker: str
    prompt_id: str
    subcorpus: str
    utterance_id: str
    transcript: str
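

# A small sketch of how the NamedTuples above are used: UttInfo carries the per-utterance
# bookkeeping recovered from filenames and transcript files, and HeroicoMetaData later
# pairs an audio file with its soundfile info and text. The field values below are
# illustrative placeholders, not real corpus entries.
def _example_utt_info() -> UttInfo:
    return UttInfo(
        fold="train",
        speaker="1",
        prompt_id="10",
        subcorpus="answers",
        utterance_id="answers-1-10",
        transcript="texto de ejemplo",
    )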


def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    import soundfile

    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f"No such directory: {speech_dir}"
    assert transcript_dir.is_dir(), f"No such directory: {transcript_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile("Answers_Spanish")
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile("Recordings_Spanish")
    heroico_recitations_train_path_pattern = re.compile("Recordings_Spanish")
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+"
    )
    usma_native_path_pattern = re.compile("usma/native")
    usma_native_prompt_id_pattern = re.compile(r"s\d+")
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r"nonnative.+\.wav")

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding="iso-8859-1") as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split("/")
            utt_id = "-".join(["answers", spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir, heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding="iso-8859-1") as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = "-".join(["heroico-recitations", idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding="iso-8859-1") as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = "-".join(["usma", idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob("*.wav")
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = "-".join(["answers", spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold="train",
                speaker=spk,
                prompt_id=pid,
                subcorpus="answers",
                utterance_id=utt_id,
                transcript=transcripts[utt_id],
            )
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = "-".join(["usma", spk, pid])
            trans_id = "-".join(["usma", pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold="test",
                speaker=spk,
                prompt_id=pid,
                subcorpus="usma",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance info for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = "-".join(["usma", spk, pid])
            trans_id = "-".join(["usma", pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold="test",
                speaker=spk,
                prompt_id=pid,
                subcorpus="usma",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = "-".join(["heroico-recitations", spk, pid])
            trans_id = "-".join(["heroico-recitations", pid])
            uttdata[str(wav_file)] = UttInfo(
                fold="train",
                speaker=spk,
                prompt_id=pid,
                subcorpus="heroico-recitations",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = "-".join(["heroico-recitations-repeats", spk, pid])
            trans_id = "-".join(["heroico-recitations-repeats", pid])
            uttdata[str(wav_file)] = UttInfo(
                fold="devtest",
                speaker=spk,
                prompt_id=pid,
                subcorpus="heroico-recitations-repeats",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        else:
            logging.warning(f"No such file: {wav_file}")

    audio_paths = speech_dir.rglob("*.wav")
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # soundfile.info returns the sample rate, number of frames, duration, etc.
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = "-".join([uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript,
            )

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type="file", channels=[0], source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.samplerate),
                num_samples=metadata[idx].audio_info.frames,
                duration=metadata[idx].audio_info.duration,
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language="Spanish",
                speaker=idx.split("-")[-2],
                text=metadata[idx].text,
            )
            for idx in audio.recordings
        )

        audio, supervision = fix_manifests(audio, supervision)
        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_file(output_dir / f"heroico_supervisions_{fld}.jsonl.gz")
            audio.to_file(output_dir / f"heroico_recordings_{fld}.jsonl.gz")

        manifests[fld] = {"recordings": audio, "supervisions": supervision}

    return manifests
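

# A minimal end-to-end sketch, assuming the corpus has already been downloaded and
# extracted with download_heroico. The "speech" and "transcripts" subdirectory names
# and the "data/manifests" output path below are placeholders; point them at wherever
# the wav files and prompt/transcript files actually live on disk.
def _example_prepare(corpus_root: Path) -> None:
    manifests = prepare_heroico(
        speech_dir=corpus_root / "speech",
        transcript_dir=corpus_root / "transcripts",
        output_dir="data/manifests",
    )
    # Each fold maps to a dict with 'recordings' and 'supervisions' manifests.
    for fold, mans in manifests.items():
        print(fold, len(mans["recordings"]), len(mans["supervisions"]))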