# Source code for lhotse.recipes.fleurs

"""
This recipe provides functionality for downloading and preparing the fleurs
corpus. The data is hosted on huggingface and to enable more control of the
download format, we use the streaming download interface and save each audio
file as it is streamed. The download can take quite some time.

The fleurs corpus consists of data in 102 languages spoken by multiple speakers.
There are about 10 hours of training data in each language, with smaller
accompanying dev and test sets. Full details can be found in

@inproceedings{conneau2023fleurs,
  title={Fleurs: Few-shot learning evaluation of universal representations of speech},
  author={Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu and Axelrod, Vera and Dalmia, Siddharth and Riesa, Jason and Rivera, Clara and Bapna, Ankur},
  booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
  pages={798--805},
  year={2023},
  organization={IEEE}
}
"""
import logging
from pathlib import Path
from typing import Dict, Optional, Sequence, Union

from tqdm import tqdm

from lhotse import (
    Recording,
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    audio,
    fix_manifests,
    get_ffmpeg_torchaudio_info_enabled,
    set_ffmpeg_torchaudio_info_enabled,
)
from lhotse.parallel import parallel_map
from lhotse.utils import Pathlike, is_module_available

# The FLEURS languages are identified by lower-case names made of a language
# code followed by a country/region code, joined by underscores, e.g.,
#
#  en_us, fr_fr, ml_in
#
# for American English, French French and Indian Malayalam respectively.
# Most language codes have two letters, but some entries below use three
# letters (e.g. ceb_ph), a three-part name (e.g. cmn_hans_cn), or a numeric
# region (es_419).
#
# This list is what the language specifier "all" expands to in this module.

DEFAULT_LANGUAGES = [
    "af_za",
    "am_et",
    "ar_eg",
    "as_in",
    "ast_es",
    "az_az",
    "be_by",
    "bg_bg",
    "bn_in",
    "bs_ba",
    "ca_es",
    "ceb_ph",
    "ckb_iq",
    "cmn_hans_cn",
    "cs_cz",
    "cy_gb",
    "da_dk",
    "de_de",
    "el_gr",
    "en_us",
    "es_419",
    "et_ee",
    "fa_ir",
    "ff_sn",
    "fi_fi",
    "fil_ph",
    "fr_fr",
    "ga_ie",
    "gl_es",
    "gu_in",
    "ha_ng",
    "he_il",
    "hi_in",
    "hr_hr",
    "hu_hu",
    "hy_am",
    "id_id",
    "ig_ng",
    "is_is",
    "it_it",
    "ja_jp",
    "jv_id",
    "ka_ge",
    "kam_ke",
    "kea_cv",
    "kk_kz",
    "km_kh",
    "kn_in",
    "ko_kr",
    "ky_kg",
    "lb_lu",
    "lg_ug",
    "ln_cd",
    "lo_la",
    "lt_lt",
    "luo_ke",
    "lv_lv",
    "mi_nz",
    "mk_mk",
    "ml_in",
    "mn_mn",
    "mr_in",
    "ms_my",
    "mt_mt",
    "my_mm",
    "nb_no",
    "ne_np",
    "nl_nl",
    "nso_za",
    "ny_mw",
    "oc_fr",
    "om_et",
    "or_in",
    "pa_in",
    "pl_pl",
    "ps_af",
    "pt_br",
    "ro_ro",
    "ru_ru",
    "sd_in",
    "sk_sk",
    "sl_si",
    "sn_zw",
    "so_so",
    "sr_rs",
    "sv_se",
    "sw_ke",
    "ta_in",
    "te_in",
    "tg_tj",
    "th_th",
    "tr_tr",
    "uk_ua",
    "umb_ao",
    "ur_pk",
    "uz_uz",
    "vi_vn",
    "wo_sn",
    "xh_za",
    "yo_ng",
    "yue_hant_hk",
    "zu_za",
]


def download_fleurs(
    target_dir: Pathlike = ".",
    languages: Optional[Union[str, Sequence[str]]] = "all",
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download the specified fleurs datasets.

    The data are stored under ``target_dir / "fleurs"``, with one
    sub-directory per language and a shared ``metadata`` sub-directory.

    :param target_dir: The path to which the corpus will be downloaded.
    :type target_dir: Pathlike
    :param languages: Optional list of str or str specifying which
        languages to download. The str specifier for a language has the
        ISOCODE_COUNTRYCODE format, and is all lower case. By default
        this is set to "all", which will download the entire set of
        languages. ``None`` is treated like "all". A sequence whose first
        element is "all" also selects every language. An empty sequence
        downloads nothing.
    :type languages: Optional[Union[str, Sequence[str]]]
    :param force_download: Specifies whether to overwrite an existing
        archive.
    :type force_download: bool
    :return: The root path of the downloaded data
    :rtype: Path
    """
    corpus_dir = Path(target_dir) / "fleurs"
    metadata_dir = corpus_dir / "metadata"
    metadata_dir.mkdir(parents=True, exist_ok=True)

    # Normalize the specifier to a list before inspecting its first element,
    # so that None and empty sequences do not raise on indexing.
    if languages is None:
        languages = "all"
    if isinstance(languages, str):
        languages = [languages]
    else:
        languages = list(languages)
    if languages and languages[0] == "all":
        languages = DEFAULT_LANGUAGES

    for lang in tqdm(languages):
        # Download one language at a time
        download_single_fleurs_language(
            corpus_dir / lang,
            lang,
            force_download,
        )
    return corpus_dir


def download_single_fleurs_language(
    target_dir: Pathlike,
    language: str,
    force_download: bool = False,
    cache_dir: Optional[Pathlike] = None,
) -> Path:
    """
    Download a single fleurs language

    Each of the train / validation / test splits is streamed from the
    ``google/fleurs`` huggingface dataset. Every audio example is saved under
    ``target_dir / <split>`` (``validation`` is stored as ``dev``), and one
    tab-separated metadata file per split is written to
    ``<parent of target_dir>/metadata/<language>/<split>.tsv`` with the
    columns: id, audio file name, raw transcript, transcript, characters,
    number of samples, gender.

    When all splits are done, a hidden marker file is created in
    ``target_dir``; later calls return early if it exists, unless
    ``force_download`` is set.

    :param target_dir: The path to which one language will be downloaded
    :type target_dir: Pathlike
    :param language: The code for the specified language
    :type language: str
    :param force_download: Specifies whether to overwrite an existing
        archive.
    :type force_download: bool
    :param cache_dir: Optional cache directory handed to
        ``datasets.load_dataset``. When None, the library default is used.
    :type cache_dir: Optional[Pathlike]
    :return: The path to the downloaded data for the specified language
    :rtype: Path
    :raises ImportError: if the huggingface ``datasets`` package is missing.
    """
    if not is_module_available("datasets"):
        raise ImportError(
            "The huggingface datasets package is not installed. Please install"
            " ...(pip install datasets)"
        )
    from datasets import load_dataset

    target_dir = Path(target_dir)
    metadata_dir = target_dir.parents[0] / "metadata" / language
    target_dir.mkdir(parents=True, exist_ok=True)
    metadata_dir.mkdir(parents=True, exist_ok=True)

    completed_detector = target_dir / f".{language}_completed"
    if completed_detector.is_file() and not force_download:
        logging.info(f"Skipping download because {completed_detector} exists.")
        return target_dir

    hf_cache_dir = None if cache_dir is None else str(cache_dir)

    for split in tqdm(["train", "validation", "test"]):
        fleurs = load_dataset(
            "google/fleurs",
            language,
            cache_dir=hf_cache_dir,
            streaming=True,
            split=split,
        )
        metadata = []
        osplit = "dev" if split == "validation" else split
        split_dir = target_dir / osplit
        split_dir.mkdir(parents=True, exist_ok=True)
        for data in tqdm(fleurs, desc=f"Downloading data from {language}-{osplit}"):
            audio_name = Path(data["audio"]["path"]).name
            audio.save_audio(
                f"{split_dir}/{audio_name}",
                data["audio"]["array"],
                data["audio"]["sampling_rate"],
            )
            metadata.append(
                [
                    str(data["id"]),  # ID
                    audio_name,  # filename
                    data["raw_transcription"],  # raw transcript
                    data["transcription"],  # transcript
                    # chars: words joined by "|", then space-separated symbols
                    " ".join("|".join(data["transcription"].split())) + " |",
                    str(data["num_samples"]),  # number of audio samples
                    # NOTE(review): every gender value other than 1 is
                    # recorded as MALE -- confirm against the dataset labels.
                    "FEMALE" if data["gender"] == 1 else "MALE",  # gender
                ]
            )
        # Transcripts are multilingual, so pin the encoding rather than
        # depending on the platform default.
        with open(metadata_dir / f"{osplit}.tsv", "w", encoding="utf-8") as f:
            for md in metadata:
                print("\t".join(md), file=f)

    completed_detector.touch()
    return target_dir


def prepare_fleurs(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    languages: Optional[Union[str, Sequence[str]]] = "all",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Prepares the manifest for all of the FLEURS languages requested.

    Languages without a sub-directory in ``corpus_dir`` are skipped (an info
    message is logged) and do not appear in the result.

    :param corpus_dir: Path to the root where the FLEURS data are stored.
    :type corpus_dir: Pathlike,
    :param output_dir: The directory where the .jsonl.gz manifests will be
        written, in one sub-directory per language. When None, nothing is
        written to disk and the manifests are only returned.
    :type output_dir: Optional[Pathlike],
    :param languages: str or str sequence specifying the languages to prepare.
        The str 'all' (alone, or as the first element of a sequence) prepares
        all 102 languages. ``None`` is treated like 'all'. An empty sequence
        prepares nothing.
    :type languages: Optional[Union[str, Sequence[str]]]
    :param num_jobs: Number of parallel jobs, forwarded to
        ``prepare_single_fleurs_language``.
    :type num_jobs: int
    :return: The manifests, keyed by language, then by split ("train", "dev",
        "test"), then by "recordings" / "supervisions".
    :rtype: Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # output_dir is optional: only touch the filesystem when it is given.
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

    # Normalize the specifier to a list before inspecting its first element,
    # so that None and empty sequences do not raise on indexing.
    if languages is None:
        languages = "all"
    if isinstance(languages, str):
        langs_list = [languages]
    else:
        langs_list = list(languages)
    if langs_list and langs_list[0] == "all":
        langs_list = DEFAULT_LANGUAGES

    # Start building the recordings and supervisions
    manifests = {}
    for lang in langs_list:
        corpus_dir_lang = corpus_dir / f"{lang}"
        if not corpus_dir_lang.is_dir():
            logging.info(f"Skipping {lang}. No directory {corpus_dir_lang} found.")
            continue
        output_dir_lang = None
        if output_dir is not None:
            output_dir_lang = output_dir / f"{lang}"
            output_dir_lang.mkdir(mode=0o777, parents=True, exist_ok=True)
        manifests[lang] = prepare_single_fleurs_language(
            corpus_dir_lang,
            output_dir_lang,
            language=lang,
            num_jobs=num_jobs,
        )

    if output_dir is not None:
        for lang, lang_manifests in manifests.items():
            lang_dir = output_dir / f"{lang}"
            for dset in ("train", "dev", "test"):
                lang_manifests[dset]["supervisions"].to_file(
                    lang_dir / f"fleurs-{lang}_supervisions_{dset}.jsonl.gz"
                )
                lang_manifests[dset]["recordings"].to_file(
                    lang_dir / f"fleurs-{lang}_recordings_{dset}.jsonl.gz"
                )
    return manifests


def _make_recording(path):
    """Create a Recording from the audio file at ``path``, using the file stem as its id."""
    recording_id = Path(path).stem
    return Recording.from_file(path, recording_id=recording_id)


def prepare_single_fleurs_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single FLEURS language.

    Supervisions are built from the per-split metadata files located at
    ``<parent of corpus_dir>/metadata/<name of corpus_dir>/<split>.tsv``.
    Recordings are built from ``corpus_dir/<split>/<recording_id>.wav``.

    :param corpus_dir: Path to the directory holding the data of one language.
    :type corpus_dir: Pathlike,
    :param output_dir: Accepted for symmetry with ``prepare_fleurs``; this
        function does not write anything itself.
    :type output_dir: Optional[Pathlike],
    :param language: str specifying the language to prepare. It is stored in
        the ``language`` field of every supervision.
    :type language: str
    :param num_jobs: Number of parallel jobs used to create the recordings.
    :type num_jobs: int
    :return: The manifests, keyed by split ("train", "dev", "test") and then
        by "recordings" / "supervisions".
    :rtype: Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]
    :raises ValueError: if a metadata line does not have exactly 7
        tab-separated fields.
    """
    corpus_dir = Path(corpus_dir)
    # Use the full directory name, matching the metadata location used when
    # the language was downloaded.
    metadata_dir = corpus_dir.parents[0] / "metadata" / corpus_dir.name

    dsets = ("train", "dev", "test")
    recordings = {dset: [] for dset in dsets}
    supervisions = {dset: [] for dset in dsets}

    # First prepare the supervisions
    for dset in dsets:
        print(f"Preparing {dset} ...")
        # Number of times each prompt id has been seen so far in this split;
        # it makes the supervision and speaker ids unique per repetition.
        prompt_ids = {}
        tsv_path = metadata_dir / f"{dset}.tsv"
        with open(tsv_path, encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # Only drop the line terminator: stripping all whitespace
                # could remove tabs that delimit empty fields.
                line = line.rstrip("\n")
                if not line.strip():
                    continue
                vals = line.split("\t")
                if len(vals) != 7:
                    raise ValueError(
                        f"Expected 7 tab-separated fields but found "
                        f"{len(vals)} in {tsv_path}, line {line_no}"
                    )
                prompt_id, fname, raw_text, text, _, nsamples, gender = vals
                prompt_ids[prompt_id] = prompt_ids.get(prompt_id, 0) + 1
                fname = Path(fname).stem
                supervisions[dset].append(
                    SupervisionSegment(
                        id=f"{prompt_id}_{prompt_ids[prompt_id]}_{fname}",
                        recording_id=fname,
                        start=0.0,
                        # Duration is derived from the sample count at a
                        # fixed rate of 16000 samples per second.
                        duration=round(int(nsamples) / 16000, 4),
                        channel=0,
                        text=text,
                        language=language,
                        speaker=f"{prompt_id}_{prompt_ids[prompt_id]}",
                        gender=gender,
                        custom={"raw_text": raw_text},
                    )
                )

    # Then create one recording per supervision from the audio files
    for dset in dsets:
        for reco in tqdm(
            parallel_map(
                _make_recording,
                (
                    corpus_dir / f"{dset}/{s.recording_id}.wav"
                    for s in supervisions[dset]
                ),
                num_jobs=num_jobs,
            ),
            desc=f"Making recordings from {language} {dset}",
        ):
            recordings[dset].append(reco)

    manifests = {}
    for dset in dsets:
        sups = SupervisionSet.from_segments(supervisions[dset])
        recos = RecordingSet.from_recordings(recordings[dset])
        recos, sups = fix_manifests(recos, sups)
        manifests[dset] = {"supervisions": sups, "recordings": recos}
    return manifests