# Source code for lhotse.recipes.aishell

"""
About the Aishell corpus
Aishell is an open-source Chinese Mandarin speech corpus published by Beijing Shell Shell Technology Co., Ltd.,
publicly available on https://www.openslr.org/33
"""

import logging
import os
import shutil
import tarfile
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm.auto import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, resumable_download, safe_extract


def text_normalize(line: str) -> str:
    """
    Normalize a transcript line: map full-width Latin characters to their
    ASCII equivalents and upper-case the result.

    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    sed 's/ａ/a/g' | sed 's/ｂ/b/g' |\
    sed 's/ｃ/c/g' | sed 's/ｋ/k/g' |\
    sed 's/ｔ/t/g' > $dir/transcripts.t

    :param line: a raw transcript line.
    :return: the normalized, upper-cased line.

    NOTE(review): the previous body replaced each character with itself
    (e.g. ``line.replace("a", "a")``) — the full-width characters had been
    lost, so the normalization was a no-op. Restored per the wenet script
    referenced above, using a single ``str.translate`` pass.
    """
    # Full-width -> half-width (U+FF41 etc. -> ASCII), one C-level pass.
    line = line.translate(str.maketrans("ａｂｃｋｔ", "abckt"))
    return line.upper()


def download_aishell(
    target_dir: Pathlike = ".",
    force_download: bool = False,
    base_url: str = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the AISHELL dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/33"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "aishell"
    dataset_tar_name = "data_aishell.tgz"
    resources_tar_name = "resource_aishell.tgz"
    for tar_name in (dataset_tar_name, resources_tar_name):
        tar_path = target_dir / tar_name
        # Strip the ".tgz" suffix to get the extraction target directory.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / ".completed"
        # A ".completed" marker means this tarball was already fetched and
        # extracted on a previous run — skip it entirely.
        if completed_detector.is_file():
            logging.info(
                f"Skipping download of {tar_name} because {completed_detector} exists."
            )
            continue
        resumable_download(
            f"{url}/{tar_name}", filename=tar_path, force_download=force_download
        )
        # Remove any partial extraction before unpacking afresh.
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            safe_extract(tar, path=corpus_dir)
        if tar_name == dataset_tar_name:
            # The audio archive contains nested per-speaker tarballs under
            # "wav/"; unpack each of them in place.
            wav_dir = extracted_dir / "wav"
            for sub_tar_name in os.listdir(wav_dir):
                with tarfile.open(wav_dir / sub_tar_name) as tar:
                    safe_extract(tar, path=wav_dir)
        completed_detector.touch()
    return corpus_dir
def prepare_aishell(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Build a utterance-id -> normalized-transcript lookup table.
    transcript_path = corpus_dir / "data_aishell/transcript/aishell_transcript_v0.8.txt"
    transcript_dict = {}
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f.readlines():
            idx_transcript = line.split()
            content = " ".join(idx_transcript[1:])
            content = text_normalize(content)
            transcript_dict[idx_transcript[0]] = content

    manifests = defaultdict(dict)
    dataset_parts = ["train", "dev", "test"]
    for part in tqdm(
        dataset_parts,
        desc="Process aishell audio, it takes about 102 seconds.",
    ):
        logging.info(f"Processing aishell subset: {part}")
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / "data_aishell" / "wav" / f"{part}"
        for audio_path in wav_path.rglob("**/*.wav"):
            idx = audio_path.stem
            # Directory layout is .../<speaker>/<utt_id>.wav, so the parent
            # directory name identifies the speaker.
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                logging.warning(f"{audio_path} has no transcript.")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                # The space between words in the text is removed here,
                # in advance.
                text=text.strip().replace(" ", ""),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"aishell_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"aishell_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }
    return manifests