# Source code for lhotse.recipes.cslu_kids

"""
Data preparation recipe for the CSLU Kids corpus (https://catalog.ldc.upenn.edu/LDC2007S18).

Summary of the corpus from the LDC webpage:

Collection of spontaneous and prompted speech from 1100 children between Kindergarten
and Grade 10 in the Forest Grove School District in Oregon. All children -- approximately
100 children at each grade level -- read approximately 60 items from a total list of 319
phonetically-balanced but simple words, sentences or digit strings. Each utterance of
spontaneous speech begins with a recitation of the alphabet and contains a monologue of
about one minute in duration. This release consists of 1017 files containing approximately
8-10 minutes of speech per speaker. Corresponding word-level transcriptions are also included.

Prompted speech is verified and divided into the following categories:

1 Good: Only the target word is said.
2 Maybe: Target word is present, but there's other junk in the file.
3 Bad: Target word is not said.
4 Puff: Same as good, but w/ an air puff.

This data is not available for free - your institution needs to have an LDC subscription.
"""
import logging
import re
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, check_and_rglob

# Matches angle-bracketed noise tags (e.g. <bn>, <bs>) in transcripts. The
# non-greedy `.*?` stops at the first closing `>`, so each tag is matched
# separately instead of swallowing the text between two tags.
NOISE_TAGS_REGEX = re.compile("<.*?>")


def read_text(file: Path, normalize: Optional[bool] = True) -> str:
    """
    Load a transcript file and return its content flattened onto one line.

    Every newline in the file is replaced by a single space. When ``normalize``
    is truthy, every substring matched by ``NOISE_TAGS_REGEX`` is removed too.

    :param file: Path of the transcript file to read.
    :param normalize: Whether to strip noise tags from the text (default = True).
    :return: The flattened (and optionally tag-free) transcript text.
    """
    with open(file, "r") as handle:
        raw = handle.read()

    # Joining the newline-separated pieces with spaces is the same as replacing
    # each newline by a space (a trailing newline therefore yields a trailing space).
    flattened = " ".join(raw.split("\n"))
    if not normalize:
        return flattened
    return NOISE_TAGS_REGEX.sub("", flattened)


def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    Audio files whose directory layout identifies them as neither ``scripted``
    nor ``spontaneous`` are skipped with a warning, because no transcript source
    is known for them.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir)

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, "*.wav")

    # Read verification labels: each non-empty line is "<audio path> <label>",
    # keyed here by the utterance id (the file name without its extension).
    verification = {}
    for verified_file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(verified_file, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines, same as for the prompt map below.
                    continue
                path, label = line.split()
                verification[Path(path).stem] = int(label)

    # Read prompted transcriptions: each non-empty line is '<prompt id> "<text>"'
    prompts = {}
    with open(corpus_dir / "docs" / "all.map", "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            prompt_id, quoted_text = line.split(maxsplit=1)
            prompts[prompt_id] = quoted_text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):

        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        utt_type = p.parent.parent.parent.parent.stem  # scripted

        if utt_type not in ("scripted", "spontaneous"):
            # Without this guard `text`/`custom` would be unbound (or silently
            # reused from the previous utterance) when building the supervision.
            logging.warning(
                "Skipping %s: unexpected utterance type '%s'", p, utt_type
            )
            continue

        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3
        )
        recordings.append(recording)

        if utt_type == "scripted":
            text = prompts[prompt]
            # Not every scripted utterance has a verification label.
            custom = {
                "type": utt_type,
                "verification_label": verification.get(uttid),
            }
        else:  # spontaneous
            text = read_text(
                corpus_dir / "trans" / utt_type / prompt / cat / spk / f"{uttid}.txt",
                normalize=normalize_text,
            )
            custom = {"type": utt_type}

        # One supervision spanning the whole recording.
        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language="English",
                text=text,
                custom=custom,
            )
        )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the two file names use different prefixes ("cslu-kids_"
        # vs "cslu_kids_"). They are kept as-is so existing consumers keep
        # working; confirm which prefix is intended before unifying them.
        manifests["recordings"].to_file(
            output_dir / "cslu-kids_recordings_all.jsonl.gz"
        )
        manifests["supervisions"].to_file(
            output_dir / "cslu_kids_supervisions_all.jsonl.gz"
        )

    return manifests