# Source code for lhotse.recipes.earnings22

"""
About the Earnings 22 dataset:

    The Earnings 22 dataset (also referred to as earnings22) is a 119-hour corpus
    of English-language earnings calls collected from global companies. The primary
    purpose is to serve as a benchmark for industrial and academic automatic speech
    recognition (ASR) models on real-world accented speech.

    This dataset has been submitted to Interspeech 2022. The paper describing our
    methods and results can be found on arXiv at https://arxiv.org/abs/2203.15591.

    @misc{https://doi.org/10.48550/arxiv.2203.15591,
    doi = {10.48550/ARXIV.2203.15591},
    url = {https://arxiv.org/abs/2203.15591},
    author = {Del Rio, Miguel and Ha, Peter and McNamara, Quinten and Miller, Corey and Chandra, Shipra},
    keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {Earnings-22: A Practical Benchmark for Accents in the Wild},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution Share Alike 4.0 International}
    }

"""


import logging
import string
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike

_DEFAULT_URL = "https://github.com/revdotcom/speech-datasets"


def normalize(text: str) -> str:
    """
    Strip punctuation from ``text`` and lower-case it.

    Only the characters listed in ``string.punctuation`` are removed; any
    other character is left in place before lower-casing.

    :param text: str, the text to normalize.
    :return: the text without those punctuation characters, in lower case.
    """
    # Mapping a character to None in a translation table deletes it.
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(punctuation_remover).lower()


def read_metadata(path: Pathlike) -> Dict[str, List[str]]:
    """
    Read a comma-separated metadata file into a dict keyed by its first column.

    The first line is treated as a header and skipped, and blank lines are
    ignored. Lines are split on every ``,`` (no CSV quoting is interpreted).
    For each row, the first field is the key and the value is the list of the
    remaining fields *without the last one*; the last field, which carries
    the end-of-line character, is dropped.

    :param path: Pathlike, the path of the metadata file.
    :return: a dict mapping the first field of each row to its middle fields.
    """
    out: Dict[str, List[str]] = {}
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        f.readline()  # skip header
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line would otherwise add a "\n" key.
                continue
            fields = line.split(",")
            out[fields[0]] = fields[1:-1]
    return out



# [docs]  (documentation-viewer link residue; not part of the module code)
def download_earnings22(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
) -> Path:
    """Placeholder for downloading the dataset (not implemented).

    Automatic download from the github repository is not implemented: this
    function only logs an error that points to the default repository URL and
    asks for a manual download (git-lfs is needed for some of the audio
    files). None of the arguments are currently used and nothing is returned.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are meant to be saved to target_dir/earnings22/
        Please note that the github repository contains other additional
        datasets as well.
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: intended to be the path to the downloaded and extracted directory
        with data; at present the function returns ``None``.
    """
    manual_download_notice = (
        "Downloading Earnings22 from github repository is not implemented. "
        f"Please visit {_DEFAULT_URL} and download the files manually. Please "
        "follow the instructions closely as you need to use git-lfs to download "
        "some of the audio files."
    )
    logging.error(manual_download_notice)



def parse_nlp_file(filename: Pathlike) -> List[str]:
    """
    Read the tokens from a ``|``-separated ``.nlp`` transcript file.

    The first line is treated as a header and skipped. For every following
    non-empty line, the text before the first ``|`` is taken as one token.

    :param filename: Pathlike, the path of the ``.nlp`` file.
    :return: the list of tokens in file order.
    """
    transcript: List[str] = []
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(filename, encoding="utf-8") as f:
        f.readline()  # skip header
        for line in f:
            line = line.rstrip("\r\n")
            if not line:
                # An empty line would otherwise add a bare newline "token".
                continue
            transcript.append(line.split("|", 1)[0])
    return transcript



# [docs]  (documentation-viewer link residue; not part of the module code)
def prepare_earnings22(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    One recording is created per mp3 file and one supervision per ``.nlp``
    transcript; each supervision spans the whole recording that shares the
    transcript's file stem. The manifests are always rebuilt from
    ``corpus_dir``; when ``output_dir`` is given they are additionally written
    there as ``earnings22_recordings_all.jsonl.gz`` and
    ``earnings22_supervisions_all.jsonl.gz``.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media,
        transcriptions in the directory [corpus_dir]/transcripts/nlp_references
        and the metadata in [corpus_dir]/metadata.csv
    :param output_dir: Pathlike, the path where to write the manifests.
        It is created if it does not exist.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair
    :raises AssertionError: if ``corpus_dir`` is not a directory, or if the
        number of mp3 or nlp files found differs from 125.

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Number of audio files and of transcripts this recipe expects to find.
    expected_num_files = 125

    media_dir = corpus_dir / "media"
    audio_files = sorted(media_dir.glob("*.mp3"))
    assert len(audio_files) == expected_num_files, (
        f"Expected {expected_num_files} mp3 files in {media_dir}, "
        f"found {len(audio_files)}"
    )

    recording_set = RecordingSet.from_recordings(
        Recording.from_file(p) for p in audio_files
    )

    nlp_dir = corpus_dir / "transcripts" / "nlp_references"
    nlp_files = sorted(nlp_dir.glob("*.nlp"))
    assert len(nlp_files) == expected_num_files, (
        f"Expected {expected_num_files} nlp files in {nlp_dir}, "
        f"found {len(nlp_files)}"
    )

    metadata = read_metadata(corpus_dir / "metadata.csv")

    supervision_segments = []
    for nlp_file in nlp_files:
        # The transcript's file stem is used both as the supervision id and
        # as the key into the recordings and the metadata.
        recording_id = nlp_file.stem
        text = " ".join(parse_nlp_file(nlp_file))
        if normalize_text:
            text = normalize(text)

        supervision_segments.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording_set[recording_id].duration,
                channel=0,
                # Fifth value kept by read_metadata for this file; presumably
                # the accent/language column of metadata.csv -- TODO confirm.
                language=f"English-{metadata[recording_id][4]}",
                text=text,
            )
        )
    supervision_set = SupervisionSet.from_segments(supervision_segments)

    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_file(output_dir / "earnings22_supervisions_all.jsonl.gz")
        recording_set.to_file(output_dir / "earnings22_recordings_all.jsonl.gz")

    return recording_set, supervision_set