# Source code for lhotse.dataset.input_strategies

import logging
from typing import Callable, Dict, List, Tuple, Optional

import torch

from lhotse import CutSet, FeatureExtractor
from lhotse.cut import compute_supervisions_frame_mask
from lhotse.dataset.collation import collate_audio, collate_features, collate_vectors
from lhotse.utils import compute_num_frames, ifnone, supervision_to_frames, supervision_to_samples


class InputStrategy:
    """
    Converts a :class:`CutSet` into a collated batch of audio representations.
    These representations can be e.g. audio samples or features.
    They might also be single or multi channel.

    This is a base class that only defines the interface; every method
    raises :class:`NotImplementedError` and is meant to be overridden.

    .. automethod:: __call__
    """

    def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
        """
        Returns a tensor with collated input signals, and a tensor of length
        of each signal before padding.

        :param cuts: the cuts that make up a single batch.
        :raises NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError()

    def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        """
        Returns a dict that specifies the start and end bounds for each supervision,
        as a 1-D int tensor.

        Depending on the strategy, the dict should look like:

        .. code-block::

            {
                "sequence_idx": tensor(shape=(S,)),
                "start_frame": tensor(shape=(S,)),
                "num_frames": tensor(shape=(S,)),
            }

        or

        .. code-block::

            {
                "sequence_idx": tensor(shape=(S,)),
                "start_sample": tensor(shape=(S,)),
                "num_samples": tensor(shape=(S,))
            }

        Where ``S`` is the total number of supervisions encountered in the :class:`CutSet`.
        Note that ``S`` might be different than the number of cuts (``B``).
        ``sequence_idx`` means the index of the corresponding feature matrix (or cut) in a batch.

        :param cuts: the cuts that make up a single batch.
        :raises NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError()

    def supervision_masks(self, cuts: CutSet, use_alignment_if_exists: Optional[str] = None) -> torch.Tensor:
        """
        Returns a collated batch of masks, marking the supervised regions in cuts.
        They are zero-padded to the longest cut.

        Depending on the strategy implementation, it is expected to be a tensor of shape
        ``(B, NF)`` or ``(B, NS)``, where ``B`` denotes the number of cuts, ``NF`` the number
        of frames and ``NS`` the total number of samples.
        ``NF`` and ``NS`` are determined by the longest cut in a batch.

        :param cuts: the cuts that make up a single batch.
        :param use_alignment_if_exists: optional str, key for alignment type to use for
            generating the mask. If not exists, fall back on supervision time spans.
            Declared here so that the interface matches the concrete strategies.
        :raises NotImplementedError: always; subclasses provide the implementation.
        """
        raise NotImplementedError()
class PrecomputedFeatures(InputStrategy):
    """
    :class:`InputStrategy` that reads pre-computed features, whose manifests
    are attached to cuts, from disk.

    It pads the feature matrices, if needed.

    .. automethod:: __call__
    """

    def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
        """
        Reads the pre-computed features from disk/other storage.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :param cuts: the cuts that make up a single batch.
        :return: a tensor with collated features, and a tensor of ``num_frames`` of each cut before padding.
        """
        return collate_features(cuts)

    def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        """
        Returns a dict that specifies the start and end bounds for each supervision,
        as a 1-D int tensor, in terms of frames:

        .. code-block::

            {
                "sequence_idx": tensor(shape=(S,)),
                "start_frame": tensor(shape=(S,)),
                "num_frames": tensor(shape=(S,))
            }

        Where ``S`` is the total number of supervisions encountered in the :class:`CutSet`.
        Note that ``S`` might be different than the number of cuts (``B``).
        ``sequence_idx`` means the index of the corresponding feature matrix (or cut) in a batch.

        When no cut carries a supervision, the three tensors are empty (``S == 0``).

        :param cuts: the cuts that make up a single batch.
        """
        sequence_idx: List[int] = []
        start_frames: List[int] = []
        nums_frames: List[int] = []
        # A single pass keeps the three lists aligned and, unlike unpacking
        # ``zip(*...)`` into two names, does not fail when there are no supervisions.
        for idx, cut in enumerate(cuts):
            for sup in cut.supervisions:
                start, num = supervision_to_frames(
                    sup, cut.frame_shift, cut.sampling_rate, max_frames=cut.num_frames
                )
                sequence_idx.append(idx)
                start_frames.append(start)
                nums_frames.append(num)
        return {
            'sequence_idx': torch.tensor(sequence_idx, dtype=torch.int32),
            'start_frame': torch.tensor(start_frames, dtype=torch.int32),
            'num_frames': torch.tensor(nums_frames, dtype=torch.int32)
        }

    def supervision_masks(self, cuts: CutSet, use_alignment_if_exists: Optional[str] = None) -> torch.Tensor:
        """
        Returns the mask for supervised frames.

        :param cuts: the cuts that make up a single batch.
        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask.
            If not exists, fall back on supervision time spans.
        """
        return collate_vectors([
            cut.supervisions_feature_mask(use_alignment_if_exists=use_alignment_if_exists)
            for cut in cuts
        ])
class AudioSamples(InputStrategy):
    """
    :class:`InputStrategy` that reads single-channel recordings, whose manifests
    are attached to cuts, from disk (or other audio source).

    It pads the recordings, if needed.

    .. automethod:: __call__
    """

    def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
        """
        Reads the audio samples from recordings on disk/other storage.
        The returned shape is ``(B, T) => (batch_size, num_samples)``.

        :param cuts: the cuts that make up a single batch.
        :return: a tensor with collated audio samples, and a tensor of ``num_samples`` of each cut before padding.
        """
        return collate_audio(cuts)

    def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        """
        Returns a dict that specifies the start and end bounds for each supervision,
        as a 1-D int tensor, in terms of samples:

        .. code-block::

            {
                "sequence_idx": tensor(shape=(S,)),
                "start_sample": tensor(shape=(S,)),
                "num_samples": tensor(shape=(S,))
            }

        Where ``S`` is the total number of supervisions encountered in the :class:`CutSet`.
        Note that ``S`` might be different than the number of cuts (``B``).
        ``sequence_idx`` means the index of the corresponding feature matrix (or cut) in a batch.

        When no cut carries a supervision, the three tensors are empty (``S == 0``).

        :param cuts: the cuts that make up a single batch.
        """
        sequence_idx: List[int] = []
        start_samples: List[int] = []
        nums_samples: List[int] = []
        # A single pass keeps the three lists aligned and, unlike unpacking
        # ``zip(*...)`` into two names, does not fail when there are no supervisions.
        for idx, cut in enumerate(cuts):
            for sup in cut.supervisions:
                start, num = supervision_to_samples(sup, cut.sampling_rate)
                sequence_idx.append(idx)
                start_samples.append(start)
                nums_samples.append(num)
        return {
            'sequence_idx': torch.tensor(sequence_idx, dtype=torch.int32),
            'start_sample': torch.tensor(start_samples, dtype=torch.int32),
            'num_samples': torch.tensor(nums_samples, dtype=torch.int32)
        }

    def supervision_masks(self, cuts: CutSet, use_alignment_if_exists: Optional[str] = None) -> torch.Tensor:
        """
        Returns the mask for supervised samples.

        :param cuts: the cuts that make up a single batch.
        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask.
            If not exists, fall back on supervision time spans.
        """
        return collate_vectors([
            cut.supervisions_audio_mask(use_alignment_if_exists=use_alignment_if_exists)
            for cut in cuts
        ])
class OnTheFlyFeatures(InputStrategy):
    """
    :class:`InputStrategy` that reads single-channel recordings, whose manifests
    are attached to cuts, from disk (or other audio source).
    Then, it uses a :class:`FeatureExtractor` to compute their features on-the-fly.

    It pads the recordings, if needed.

    .. note::
        The batch feature extraction performed here is not as efficient as it could be,
        but it allows to use arbitrary feature extraction method that may work on
        a single recording at a time.

    .. automethod:: __call__
    """

    def __init__(
            self,
            extractor: FeatureExtractor,
            wave_transforms: Optional[List[Callable[[torch.Tensor], torch.Tensor]]] = None
    ) -> None:
        """
        OnTheFlyFeatures' constructor.

        :param extractor: the feature extractor used on-the-fly (individually on each waveform).
        :param wave_transforms: an optional list of transforms applied on the batch of audio
            waveforms collated into a single tensor, right before the feature extraction.
        """
        self.extractor = extractor
        self.wave_transforms = ifnone(wave_transforms, [])

    def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :param cuts: the cuts that make up a single batch.
        :return: a tensor with collated features, and a tensor of ``num_frames`` of each cut before padding.
        """
        audio, _ = collate_audio(cuts)

        # Transforms operate on the whole collated (padded) batch of waveforms.
        for tfnm in self.wave_transforms:
            audio = tfnm(audio)

        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audio[idx].numpy()
            try:
                features = self.extractor.extract(samples, cut.sampling_rate)
            except Exception:
                # Log which cut failed, then let the original error propagate.
                logging.error(f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}")
                raise
            features_single.append(torch.from_numpy(features))
        features_batch = torch.stack(features_single)

        # The lengths are derived from each cut's own duration, i.e. before padding.
        feature_lens = torch.tensor([
            compute_num_frames(
                cut.duration,
                self.extractor.frame_shift,
                cut.sampling_rate
            ) for cut in cuts
        ], dtype=torch.int32)

        return features_batch, feature_lens

    def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        """
        Returns a dict that specifies the start and end bounds for each supervision,
        as a 1-D int tensor, in terms of frames:

        .. code-block::

            {
                "sequence_idx": tensor(shape=(S,)),
                "start_frame": tensor(shape=(S,)),
                "num_frames": tensor(shape=(S,))
            }

        Where ``S`` is the total number of supervisions encountered in the :class:`CutSet`.
        Note that ``S`` might be different than the number of cuts (``B``).
        ``sequence_idx`` means the index of the corresponding feature matrix (or cut) in a batch.

        When no cut carries a supervision, the three tensors are empty (``S == 0``).

        :param cuts: the cuts that make up a single batch.
        """
        sequence_idx: List[int] = []
        start_frames: List[int] = []
        nums_frames: List[int] = []
        # A single pass keeps the three lists aligned and, unlike unpacking
        # ``zip(*...)`` into two names, does not fail when there are no supervisions.
        for idx, cut in enumerate(cuts):
            for sup in cut.supervisions:
                start, num = supervision_to_frames(sup, self.extractor.frame_shift, cut.sampling_rate)
                sequence_idx.append(idx)
                start_frames.append(start)
                nums_frames.append(num)
        return {
            'sequence_idx': torch.tensor(sequence_idx, dtype=torch.int32),
            'start_frame': torch.tensor(start_frames, dtype=torch.int32),
            'num_frames': torch.tensor(nums_frames, dtype=torch.int32)
        }

    def supervision_masks(self, cuts: CutSet, use_alignment_if_exists: Optional[str] = None) -> torch.Tensor:
        """
        Returns the mask for supervised frames, computed with the extractor's frame shift.

        :param cuts: the cuts that make up a single batch.
        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask.
            If not exists, fall back on supervision time spans.
        """
        return collate_vectors(
            [
                compute_supervisions_frame_mask(
                    cut,
                    frame_shift=self.extractor.frame_shift,
                    use_alignment_if_exists=use_alignment_if_exists
                ) for cut in cuts
            ]
        )