Source code for lhotse.recipes.ljspeech

"""
The LJ Speech Dataset is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker
reading passages from 7 non-fiction books. A transcription is provided for each clip. Clips vary in length from 1 to
10 seconds and have a total length of approximately 24 hours.

The texts were published between 1884 and 1964, and are in the public domain. The audio was recorded in 2016-17 by
the LibriVox project and is also in the public domain.

See https://keithito.com/LJ-Speech-Dataset for more details.
"""

import logging
import re
import shutil
import tarfile
from pathlib import Path
from typing import Dict, Optional, Union

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.features import Fbank
from lhotse.features.base import TorchaudioFeatureExtractor
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, fastcopy, resumable_download, safe_extract


[docs] def download_ljspeech( target_dir: Pathlike = ".", force_download: Optional[bool] = False ) -> Path: target_dir = Path(target_dir) target_dir.mkdir(parents=True, exist_ok=True) dataset_name = "LJSpeech-1.1" tar_path = target_dir / f"{dataset_name}.tar.bz2" corpus_dir = target_dir / dataset_name completed_detector = corpus_dir / ".completed" if completed_detector.is_file(): logging.info(f"Skipping {dataset_name} because {completed_detector} exists.") return corpus_dir resumable_download( f"http://data.keithito.com/data/speech/{dataset_name}.tar.bz2", filename=tar_path, force_download=force_download, ) shutil.rmtree(corpus_dir, ignore_errors=True) with tarfile.open(tar_path) as tar: safe_extract(tar, path=target_dir) completed_detector.touch() return corpus_dir
[docs] def prepare_ljspeech( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Generate a mapping: utt_id -> (audio_path, audio_info, text) metadata_csv_path = corpus_dir / "metadata.csv" assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}" recordings = [] supervisions = [] with open(metadata_csv_path) as f: for line in f: recording_id, text, normalized = line.strip().split("|") audio_path = corpus_dir / "wavs" / f"{recording_id}.wav" if not audio_path.is_file(): logging.warning(f"No such file: {audio_path}") continue recording = Recording.from_file(audio_path) segment = SupervisionSegment( id=recording_id, recording_id=recording_id, start=0.0, duration=recording.duration, channel=0, language="English", gender="female", text=text, custom={"normalized_text": normalized.strip()}, ) recordings.append(recording) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) recording_set, supervision_set = fix_manifests(recording_set, supervision_set) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file(output_dir / "ljspeech_supervisions_all.jsonl.gz") recording_set.to_file(output_dir / "ljspeech_recordings_all.jsonl.gz") return {"recordings": recording_set, "supervisions": supervision_set}
def feature_extractor() -> TorchaudioFeatureExtractor: """ Set up the feature extractor for TTS task. :return: A feature extractor with custom parameters. """ feature_extractor = Fbank() feature_extractor.config.num_mel_bins = 80 return feature_extractor def text_normalizer(segment: SupervisionSegment) -> SupervisionSegment: text = segment.text.upper() text = re.sub(r"[^\w !?]", "", text) text = re.sub(r"^\s+", "", text) text = re.sub(r"\s+$", "", text) text = re.sub(r"\s+", " ", text) return fastcopy(segment, text=text)