"""
Data preparation recipe for CMU Kids corpus (https://catalog.ldc.upenn.edu/LDC97S63):
Summary of corpus from LDC webpage:
This database is comprised of sentences read aloud by children. It was originally designed
in order to create a training set of children's speech for the SPHINX II automatic speech
recognizer for its use in the LISTEN project at Carnegie Mellon University.
The children range in age from six to eleven (see details below) and were in first through
third grades (the 11-year-old was in 6th grade) at the time of recording. There were 24 male
and 52 female speakers. There are 5,180 utterances in all.
The speakers come from two separate populations:
1. SIM95: They were recorded in the summer of 1995 and were enrolled in either the Chatham
College Summer Camp or the Mount Lebanon Extended Day Summer Fun program in Pittsburgh.
They were recorded on-site. There are 44 speakers and 3,333 utterances in this set. These are
"good" reading examples.
2. FP: These are examples of errorful reading and dialectic variants. The readers come from
Fort Pitt School in Pittsburgh and were recorded in April 1996. There are 32 speakers and
1,847 utterances in this set.
The user should be aware that the speakers' dialect partly reflects what is locally called "Pittsburghese."
The corpus does not come with a train/dev/test split, and the Kaldi recipe splits it randomly
into 70%/30% train-test. We do not perform any splits, and just return the complete recording
and supervision manifests.
This data is not available for free - your institution needs to have an LDC subscription.
"""
import logging
from pathlib import Path
from typing import Dict, Optional, Tuple, Union

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SIM95/FP), spoken transcript, and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken. It
    contains noise tags and phone transcription in case the pronunciation differed
    from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    # ``Path()`` is a no-op on an existing Path, so no isinstance check is needed.
    corpus_dir = Path(corpus_dir)
    # Accept either the corpus root or the ``cmu_kids`` subdirectory itself.
    if corpus_dir.stem == "cmu_kids":
        corpus_dir = corpus_dir.parent

    tables_dir = corpus_dir / "cmu_kids" / "tables"
    # Prompt text for all utterances, keyed by 3-character utterance ID.
    utterances = _read_prompts(tables_dir / "sentence.tbl")
    # Speaker metadata: speaker ID -> (population, grade, age).
    speaker_info = _read_speaker_info(tables_dir / "speaker.tbl")

    recordings = []
    supervisions = []
    # Iterate through all transcriptions; each row yields one recording
    # and one supervision segment spanning the full recording.
    with open(tables_dir / "transcrp.tbl", "r") as f:
        for line in f:
            trn_id, transcript = line.strip().split(maxsplit=1)
            # The transcription ID encodes: speaker (chars 0-3),
            # utterance (chars 4-6), and transcription bin (char 7).
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            # Named ``trn_bin`` to avoid shadowing the builtin ``bin``.
            trn_bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]
            audio_path = (
                corpus_dir / "cmu_kids" / "kids" / spk / "signal" / f"{trn_id}.sph"
            )
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3
            )
            recordings.append(recording)
            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    # Speaker IDs start with 'm' (male) or 'f' (female).
                    gender="Male" if spk[0] == "m" else "Female",
                    language="English",
                    text=utterances[utt],
                    custom={
                        # "NA" in speaker.tbl means the value is unknown.
                        "speaker_grade": grade if grade != "NA" else None,
                        "speaker_age": int(age) if age != "NA" else None,
                        "speaker_population": pop,
                        "bin": trn_bin,
                        "spoken_transcript": transcript,
                    },
                )
            )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_file(output_dir / "cmu-kids_recordings_all.jsonl.gz")
        manifests["supervisions"].to_file(
            output_dir / "cmu-kids_supervisions_all.jsonl.gz"
        )

    return manifests


def _read_prompts(path: Path) -> Dict[str, str]:
    """Parse ``sentence.tbl`` and return a mapping of utterance ID -> prompt text."""
    prompts = {}
    with open(path, "r") as f:
        for line in f:
            # Columns are: utterance ID, count, prompt text.
            utt, _, text = line.strip().split("\t")
            prompts[utt] = text
    return prompts


def _read_speaker_info(path: Path) -> Dict[str, Tuple[str, str, str]]:
    """
    Parse ``speaker.tbl`` and return a mapping of
    speaker ID -> (population, grade, age), where grade/age may be the
    literal string "NA" when unknown.
    """
    info = {}
    with open(path, "r") as f:
        # Skip the two header lines of the table.
        for _ in range(2):
            next(f)
        for line in f:
            # Columns: ID  LOC  GR/AGE  TOT  BIN2
            # Example row: fabm  SUM95  3/9  100  62
            spk, pop, gr_age, _, _ = line.strip().split("\t")
            grade, age = gr_age.split("/")
            info[spk] = (pop, grade, age)
    return info