Source code for lhotse.recipes.tedlium

"""
The following are the original TED-LIUM 3 README contents.

This is the TED-LIUM corpus release 3,
licensed under Creative Commons BY-NC-ND 3.0 (http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en).

All talks and text are property of TED Conferences LLC.

This new TED-LIUM release was made through a collaboration between the Ubiqus company and the LIUM (University of Le Mans, France)

---

Contents:

- 2351 audio talks in NIST sphere format (SPH), including talks from TED-LIUM 2: be careful, same talks but not same audio files (only these audio file must be used with the TED-LIUM 3 STM files)
--> 452 hours of audio
- 2351 aligned automatic transcripts in STM format

- TEDLIUM 2 dev and test data: 19 TED talks in SPH format with corresponding manual transcriptions (cf. 'legacy' distribution below).

- Dictionary with pronunciations (159848 entries), same file as the one included in TED-LIUM 2
- Selected monolingual data for language modeling from WMT12 publicly available corpora: these files come from the TED-LIUM 2 release, but have been modified to get a tokenization more relevant for English language

- Two corpus distributions:
-- the legacy one, on which the dev and test datasets are the same as in TED-LIUM 2 (and TED-LIUM 1).
-- the 'speaker adaptation' one, especially designed for experiments on speaker adaptation.

---

SPH format info:

Channels       : 1
Sample Rate    : 16000
Precision      : 16-bit
Bit Rate       : 256k
Sample Encoding: 16-bit Signed Integer PCM

---

François Hernandez, Vincent Nguyen, Sahar Ghannay, Natalia Tomashenko, and Yannick Estève, "TED-LIUM 3: twice as much data and corpus repartition for experiments on speaker adaptation", submitted to the 20th International Conference on Speech and Computer (SPECOM 2018), September 2018, Leipzig, Germany
A preprint version is available on arxiv (and in the doc/ directory):
https://arxiv.org/abs/1805.04699
"""
import logging
import shutil
import tarfile
from concurrent.futures.thread import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

from lhotse import (
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    validate_recordings_and_supervisions,
)
from lhotse.qa import fix_manifests
from lhotse.recipes.utils import normalize_text_tedlium
from lhotse.utils import Pathlike, resumable_download, safe_extract

TEDLIUM_PARTS = ("train", "dev", "test")



[docs]
def download_tedlium(
    target_dir: Pathlike = ".", force_download: Optional[bool] = False
) -> Path:
    """
    Download and unpack the TED-LIUM v3 archive.

    The archive is saved as ``TEDLIUM_release-3.tgz`` inside ``target_dir`` and
    extracted next to it. After a successful extraction a ``.completed`` marker
    file is created inside the ``TEDLIUM_release-3`` directory; when that marker
    is already present, the function returns immediately without downloading or
    extracting anything (``force_download`` is not consulted in that case).

    :param target_dir: Directory to download and extract into; created
        (including parents) when it does not exist.
    :param force_download: Forwarded to ``resumable_download``.
    :return: Path to the ``TEDLIUM_release-3`` directory under ``target_dir``.
    """
    download_root = Path(target_dir)
    download_root.mkdir(parents=True, exist_ok=True)

    archive = download_root / "TEDLIUM_release-3.tgz"
    extracted = download_root / "TEDLIUM_release-3"
    marker = extracted / ".completed"

    if marker.is_file():
        logging.info(f"Skipping {archive.name} because {marker} exists.")
    else:
        resumable_download(
            "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
            filename=archive,
            force_download=force_download,
        )
        # Drop any partially extracted tree so the extraction starts clean.
        shutil.rmtree(extracted, ignore_errors=True)
        with tarfile.open(archive) as tgz:
            safe_extract(tgz, path=download_root)
        # The marker is only written once extraction has finished.
        marker.touch()

    return extracted




[docs]
def prepare_tedlium(
    tedlium_root: Pathlike,
    output_dir: Optional[Pathlike] = None,
    dataset_parts: Union[str, Sequence[str]] = TEDLIUM_PARTS,
    num_jobs: int = 1,
    normalize_text: str = "none",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are returned in a dict keyed by the requested splits
    (train, dev and test by default). Each split contains a RecordingSet and
    a SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    For every split, audio is read from ``<tedlium_root>/legacy/<split>/sph/*.sph``
    and transcripts from ``<tedlium_root>/legacy/<split>/stm/*.stm``.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Path where the manifests should be written. It is created
        (including parents) when missing. When ``None``, nothing is written.
    :param dataset_parts: Which parts of the dataset to prepare; either a single
        split name or a sequence of them. By default, all parts are prepared.
    :param num_jobs: Number of parallel jobs to use.
    :param normalize_text: Text normalization mode forwarded to
        ``normalize_text_tedlium`` for every transcript.
        NOTE(review): the accepted values are defined by that helper -- confirm there.
    :return: A dict with standard corpus splits containing the manifests.
    :raises AssertionError: When the number of STM files differs from the number
        of sphere recordings found for a split.
    """
    tedlium_root = Path(tedlium_root)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the destination exists before any manifest is written to it.
        output_dir.mkdir(parents=True, exist_ok=True)
    corpus = {}

    dataset_parts = [dataset_parts] if isinstance(dataset_parts, str) else dataset_parts

    # The worker does not depend on the split, so it is built once up front.
    parse_stm = partial(_parse_stm_file, normalize_text=normalize_text)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in dataset_parts:
            logging.info(f"Processing {split} split...")
            root = tedlium_root / "legacy" / split
            recordings = RecordingSet.from_dir(
                root / "sph", pattern="*.sph", num_jobs=num_jobs
            )
            # glob() yields files in an unspecified order; sorting keeps the
            # order of the parsed segments reproducible across runs/filesystems.
            stms = sorted((root / "stm").glob("*.stm"))
            assert len(stms) == len(recordings), (
                f"Mismatch: found {len(recordings)} "
                f"sphere files and {len(stms)} STM files. "
                f"You might be missing some parts of TEDLIUM..."
            )

            # Submit everything first, then collect in submission order.
            futures = [ex.submit(parse_stm, stm) for stm in stms]
            segments = []
            for future in futures:
                segments.extend(future.result())

            supervisions = SupervisionSet.from_segments(segments)
            recordings, supervisions = fix_manifests(recordings, supervisions)

            corpus[split] = {"recordings": recordings, "supervisions": supervisions}
            validate_recordings_and_supervisions(**corpus[split])

            if output_dir is not None:
                recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
                supervisions.to_file(
                    output_dir / f"tedlium_supervisions_{split}.jsonl.gz"
                )

    return corpus



def _parse_stm_file(
    stm: Union[str, Path], normalize_text: str = "none"
) -> List[SupervisionSegment]:
    """
    Parse a single STM file into a list of supervision segments.

    Every non-empty, non-comment line must have at least six whitespace-separated
    fields. The first is used as the recording ID, the fourth and fifth as start
    and end times, and everything after the sixth as the transcript words; the
    second, third and sixth fields are ignored. ``{NOISE}`` in the transcript is
    rewritten to ``[NOISE]``, and lines whose transcript is exactly
    ``ignore_time_segment_in_scoring`` are dropped.

    :param stm: Path to the STM file.
    :param normalize_text: Normalization mode passed to ``normalize_text_tedlium``.
    :return: The segments found in the file, in file order.
    :raises ValueError: When a line has fewer than six fields or its start/end
        fields are not numbers.
    """
    stm = Path(stm)
    segments = []
    with stm.open() as f:
        # ``idx`` counts every physical line (including skipped ones), because it
        # is part of the segment ID and must stay stable for a given file.
        for idx, line in enumerate(f):
            fields = line.split()
            # Blank lines and ";;" comment lines (STM convention) hold no segment.
            if not fields or fields[0].startswith(";;"):
                continue
            if len(fields) < 6:
                raise ValueError(
                    f"{stm}:{idx + 1}: expected at least 6 fields in an STM line, "
                    f"got {len(fields)}: {line!r}"
                )
            rec_id, _, _, start, end, _, *words = fields
            start, end = float(start), float(end)
            text = " ".join(words).replace("{NOISE}", "[NOISE]")
            if text == "ignore_time_segment_in_scoring":
                continue
            segments.append(
                SupervisionSegment(
                    id=f"{rec_id}-{idx}",
                    recording_id=rec_id,
                    start=start,
                    # Rounding hides float subtraction noise in the duration.
                    duration=round(end - start, ndigits=8),
                    channel=0,
                    text=normalize_text_tedlium(text, normalize_text),
                    language="English",
                    speaker=rec_id,
                )
            )
    return segments