Source code for lhotse.recipes.callhome_egyptian

"""
About the Callhome Egyptian Arabic Corpus

  The CALLHOME Egyptian Arabic corpus of telephone speech consists of 120 unscripted
  telephone conversations between native speakers of Egyptian Colloquial Arabic (ECA),
  the spoken variety of Arabic found in Egypt. The dialect of ECA that this
  dictionary represents is Cairene Arabic.

  This recipe uses the speech and transcripts available through LDC. In addition,
  an Egyptian Arabic phonetic lexicon (available via LDC) is used to get word to
  phoneme mappings for the vocabulary. These datasets are:

  Speech : LDC97S45
  Transcripts : LDC97T19
  Lexicon : LDC99L22
"""

from decimal import Decimal
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm.auto import tqdm

from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.utils import Pathlike, check_and_rglob


def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus.

    For each of the ``train``, ``devtest`` and ``evaltest`` splits we create two
    manifests: one with recordings, and the other one with text supervisions.

    Transcript lines are expected to look like ``19.33 21.18 B: %ah Tayyib``
    (start time, end time, speaker label, text). Blank lines and lines whose end
    time is not greater than the start time are skipped. A non-blank line with
    fewer than four whitespace-separated fields raises ``ValueError``.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name (``'train'``, ``'devtest'``, ``'evaltest'``).
        Each value is a dict with manifests whose keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    # The output location does not depend on the split, so prepare it once.
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = {}
    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            # The recording id is the transcript file name without its extension.
            recording_id = p.stem
            # Counts only the segments that are kept, so ids are contiguous per file.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimal so the duration is computed exactly from the
                # decimal timestamps before being converted to float.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    )
                )
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        # Reconcile the two manifests with each other, then validate the result.
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            recordings.to_file(
                output_dir / f"callhome-egyptian_recordings_{split}.jsonl.gz"
            )
            supervisions.to_file(
                output_dir / f"callhome-egyptian_supervisions_{split}.jsonl.gz"
            )

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

    return manifests