"""
VoxPopuli provides
- 400K hours of unlabelled speech data for 23 languages
- 1.8K hours of transcribed speech data for 16 languages
- 17.3K hours of speech-to-speech interpretation data for 15x15 directions
- 29 hours of transcribed speech data of non-native English intended for research in ASR
for accented speech (15 L2 accents)
The raw data is collected from 2009-2020 European Parliament event recordings.
For details about the corpus, please refer to the website:
https://github.com/facebookresearch/voxpopuli
Reference:
Wang, Changhan et al. “VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation.” Annual Meeting of the Association
for Computational Linguistics (2021).
This script is based on code from the repository linked above.
NOTE: Our data preparation is slightly different from the original repository. In particular,
we only use the metadata to create manifests, i.e., we do not create segment-level wav files,
unlike the original repository. In this way, we can avoid duplicating the audio files.
"""
import csv
import gzip
import logging
import re
import tarfile
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional, Tuple, Union
from torch.hub import download_url_to_file
from tqdm import tqdm
from lhotse import RecordingSet, SupervisionSegment, SupervisionSet
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.utils import Pathlike, safe_extract
# fmt: off
# Languages accepted by the "asr" and "lm" tasks; the same list object also
# serves as the set of valid source languages for the "s2s" task.
ASR_LANGUAGES = [
    "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr",
    "sk", "sl", "et", "lt",
]
S2S_SRC_LANGUAGES = ASR_LANGUAGES

# Language codes that are part of LANGUAGES but not of ASR_LANGUAGES.
_NON_ASR_LANGUAGES = ["pt", "bg", "el", "lv", "mt", "sv", "da"]

# Every language code for which audio archives can be requested, plus the
# "<lang>_v2" variants of the same codes.
LANGUAGES = ASR_LANGUAGES + _NON_ASR_LANGUAGES
LANGUAGES_V2 = [f"{code}_v2" for code in LANGUAGES]

# Valid target languages for the "s2s" task (a separate list with the same
# content as LANGUAGES).
S2S_TGT_LANGUAGES = list(LANGUAGES)
S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTION = ["en", "fr", "es"]

ASR_ACCENTED_LANGUAGES = ["en_accented"]

# Recording years covered by the corpus: 2009 through 2020, inclusive.
YEARS = list(range(2009, 2021))

DOWNLOAD_BASE_URL = "https://dl.fbaipublicfiles.com/voxpopuli"
# fmt: on
[docs]
def download_voxpopuli(
    target_dir: Pathlike = ".",
    subset: Optional[str] = "asr",
) -> Path:
    """
    Download and extract the VoxPopuli audio archives for the requested subset.

    One tar archive per (language, year) pair is downloaded into
    ``target_dir / "raw_audios"``, extracted there, and then deleted.

    :param target_dir: Pathlike, the path of the dir in which to store the dataset.
    :param subset: str, the subset of the dataset to download, can be one of "400k", "100k",
        "10k", "asr", or any of the languages in LANGUAGES or LANGUAGES_V2.
    :return: the path to downloaded and extracted directory with data (``target_dir``).
    :raises ValueError: if ``subset`` is not one of the supported values.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # The "v2" data adds a second batch of archives named "<year>_2".
    years_with_v2 = YEARS + [f"{y}_2" for y in YEARS]

    if subset in LANGUAGES_V2:
        languages = [subset.split("_")[0]]
        years = years_with_v2
    elif subset in LANGUAGES:
        languages = [subset]
        years = YEARS
    else:
        named_subsets = {
            "400k": (LANGUAGES, years_with_v2),
            "100k": (LANGUAGES, YEARS),
            "10k": (LANGUAGES, [2019, 2020]),
            # The ASR subset is served from the "original" archives.
            "asr": (["original"], YEARS),
        }
        if subset not in named_subsets:
            # Fail with an actionable message instead of a TypeError raised
            # later when iterating over ``None``.
            raise ValueError(
                f"Unsupported subset: {subset!r}. Expected one of "
                f"{sorted(named_subsets)}, or a language code from LANGUAGES "
                f"or LANGUAGES_V2."
            )
        languages, years = named_subsets[subset]

    url_list = [
        f"{DOWNLOAD_BASE_URL}/audios/{language}_{year}.tar"
        for language in languages
        for year in years
    ]

    out_root = target_dir / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    logging.info(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_path = out_root / Path(url).name
        download_url_to_file(url, tar_path)
        with tarfile.open(tar_path, "r") as tar_file:
            safe_extract(tar_file, out_root)
        # The archive is no longer needed once its contents are extracted.
        tar_path.unlink()

    return target_dir
[docs]
def prepare_voxpopuli(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    task: str = "asr",
    lang: str = "en",
    source_lang: Optional[str] = None,
    target_lang: Optional[str] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the VoxPopuli manifests which consist of Recordings and Supervisions.

    The task-specific helper builds the raw manifests; each part is then passed
    through ``fix_manifests`` and ``validate_recordings_and_supervisions`` and,
    when ``output_dir`` is given, written to
    ``voxpopuli-{task}-{lang}_{recordings|supervisions}_{part}.jsonl.gz``
    (for "s2s" the language affix is ``{source_lang}-{target_lang}``).

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param task: str, the task to prepare the manifests for, can be one of "asr", "s2s", "lm".
    :param lang: str, the language to prepare the manifests for, must be one of ASR_LANGUAGES.
        This is used for "asr" and "lm" tasks.
    :param source_lang: str, the source language for the s2s task, can be one of S2S_SRC_LANGUAGES.
    :param target_lang: str, the target language for the s2s task, can be one of S2S_TGT_LANGUAGES.
    :param num_jobs: int, the number of parallel jobs to use for preparing the manifests
        (only forwarded to the "asr" task).
    :return: Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]], the manifests, keyed
        first by part name (for "asr": "train", "dev", "test") and then by
        "recordings" / "supervisions".
    :raises ValueError: if ``task`` is not one of "asr", "s2s", "lm".
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)

    if task == "asr":
        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
        manifests = _prepare_voxpopuli_asr(
            corpus_dir, output_dir, lang, num_jobs=num_jobs
        )
    elif task == "s2s":
        assert (
            source_lang in S2S_SRC_LANGUAGES
        ), f"Unsupported source language: {source_lang}"
        assert (
            target_lang in S2S_TGT_LANGUAGES
        ), f"Unsupported target language: {target_lang}"
        manifests = _prepare_voxpopuli_s2s(corpus_dir, source_lang, target_lang)
    elif task == "lm":
        assert lang in ASR_LANGUAGES, f"Unsupported language: {lang}"
        manifests = _prepare_voxpopuli_lm(corpus_dir, lang)
    else:
        # Without this branch ``manifests`` would be unbound and the loop
        # below would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            f"Unsupported task: {task!r} (expected one of 'asr', 's2s', 'lm')"
        )

    # The file-name prefix does not depend on the part, so build it once.
    lang_affix = f"{source_lang}-{target_lang}" if task == "s2s" else lang
    file_prefix = f"voxpopuli-{task}-{lang_affix}"

    for part_name, part in manifests.items():
        recordings, supervisions = fix_manifests(**part)
        validate_recordings_and_supervisions(
            recordings=recordings, supervisions=supervisions
        )

        # Return the fixed manifests rather than the raw ones.
        part["recordings"] = recordings
        part["supervisions"] = supervisions

        if output_dir is not None:
            recordings.to_file(
                output_dir / f"{file_prefix}_recordings_{part_name}.jsonl.gz"
            )
            supervisions.to_file(
                output_dir / f"{file_prefix}_supervisions_{part_name}.jsonl.gz"
            )

    return manifests
class RecordingIdFn:
    """
    Callable that derives a recording ID from an audio file path by stripping
    a trailing ``_<language>`` and then a trailing ``_original`` from the
    file stem.

    This functor class avoids error in multiprocessing:
    `AttributeError: Can't pickle local object '_prepare_voxpopuli_asr.<locals>.<lambda>'`
    """

    def __init__(self, language: str) -> None:
        """
        :param language: str, the language code whose ``_<language>`` suffix
            should be removed from file stems.
        """
        self.language = language

    def __call__(self, path: Path) -> str:
        """
        :param path: Path, the path of an audio file.
        :return: the file stem without the language / "_original" suffixes.
        """
        # Escape the language so it is always matched literally, even if it
        # ever contains regex metacharacters.
        recording_id = re.sub(f"_{re.escape(self.language)}$", "", path.stem)
        recording_id = re.sub("_original$", "", recording_id)
        return recording_id
def _prepare_voxpopuli_asr(
    corpus_dir: Path, output_dir: Optional[Path], lang: str, num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Download metadata TSV and prepare manifests for the ASR task.

    :param corpus_dir: Path, the data dir; audio is read from
        ``corpus_dir / "raw_audios" / lang``.
    :param output_dir: Optional[Path], the dir in which the annotation TSV is cached.
        When None, the TSV is cached in ``corpus_dir`` instead.
    :param lang: str, the language code; selects the audio sub-directory and the
        annotation file, and is stored on every supervision.
    :param num_jobs: int, the number of parallel jobs used to scan the audio files.
    :return: a dict with the keys "train", "dev" and "test", each mapping to a dict
        with "recordings" and "supervisions".
    """
    # First create recordings. We remove the affix "_original" from the recording ID
    logging.info("Preparing recordings (this may take a few minutes)...")
    in_root = corpus_dir / "raw_audios" / lang
    recordings = RecordingSet.from_dir(
        in_root,
        "*.ogg",
        num_jobs=num_jobs,
        recording_id=RecordingIdFn(language=lang),
    )

    # Now create supervisions

    # Get metadata TSV. ``output_dir`` is optional for callers, so fall back to
    # the corpus dir rather than failing on ``None / name``.
    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{lang}.tsv.gz"
    tsv_dir = corpus_dir if output_dir is None else output_dir
    tsv_path = tsv_dir / Path(url).name
    if not tsv_path.exists():
        logging.info(f"Downloading : {url} -> {tsv_path}")
        download_url_to_file(url, tsv_path)
    else:
        logging.info(f"Using pre-downloaded annotations {tsv_path}")

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding (assumes the annotation file is UTF-8 encoded).
    with gzip.open(tsv_path, "rt", encoding="utf-8") as f:
        metadata = list(csv.DictReader(f, delimiter="|"))

    # Get segment into list (train, dev, test)
    segments = defaultdict(list)
    # We also keep a count of the number of segments per recording; it provides
    # the running index used in the supervision IDs.
    num_segments = defaultdict(int)

    for r in tqdm(metadata):
        split = r["split"]
        if split not in ["train", "dev", "test"]:
            continue

        reco_id = r["session_id"]
        start_time = float(r["start_time"])
        duration = float(r["end_time"]) - start_time
        num_segments[reco_id] += 1
        segments[split].append(
            SupervisionSegment(
                id=f"{reco_id}-{num_segments[reco_id]}",
                recording_id=reco_id,
                start=round(start_time, ndigits=8),
                duration=round(duration, ndigits=8),
                channel=0,
                language=lang,
                speaker=r["speaker_id"],
                gender=r["gender"],
                text=r["normed_text"],
                custom={
                    "orig_text": r["original_text"],
                },
            )
        )

    manifests = defaultdict(dict)
    for split in ["train", "dev", "test"]:
        # A set gives O(1) membership tests while filtering the recordings.
        split_reco_ids = {s.recording_id for s in segments[split]}
        # Bind the IDs as a default argument so the predicate keeps this
        # split's IDs even if it is evaluated after the loop has moved on.
        manifests[split]["recordings"] = recordings.filter(
            lambda r, ids=split_reco_ids: r.id in ids
        )
        manifests[split]["supervisions"] = SupervisionSet.from_segments(segments[split])

    return manifests
def _prepare_voxpopuli_s2s(
    corpus_dir: Path, source_lang: str, target_lang: str
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the manifests for the s2s task.

    Not implemented yet. The return annotation mirrors what ``prepare_voxpopuli``
    expects from its task helpers: a dict whose values can be unpacked into
    ``fix_manifests(**v)``.

    :param corpus_dir: Path, the path of the data dir.
    :param source_lang: str, the source language.
    :param target_lang: str, the target language.
    :raises NotImplementedError: always.
    """
    raise NotImplementedError(
        "Preparing VoxPopuli manifests for the 's2s' task is not implemented yet."
    )
def _prepare_voxpopuli_lm(
    corpus_dir: Path, lang: str
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the manifests for the lm task.

    Not implemented yet. The return annotation mirrors what ``prepare_voxpopuli``
    expects from its task helpers: a dict whose values can be unpacked into
    ``fix_manifests(**v)``.

    :param corpus_dir: Path, the path of the data dir.
    :param lang: str, the language to prepare the manifests for.
    :raises NotImplementedError: always.
    """
    raise NotImplementedError(
        "Preparing VoxPopuli manifests for the 'lm' task is not implemented yet."
    )