Source code for lhotse.cut.padding

import logging
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch

from lhotse.audio import Recording
from lhotse.audio.utils import VideoInfo
from lhotse.cut.base import Cut
from lhotse.features import FeatureExtractor
from lhotse.supervision import SupervisionSegment
from lhotse.utils import (
    LOG_EPSILON,
    Pathlike,
    Seconds,
    compute_num_frames,
    compute_num_samples,
    fastcopy,
    perturb_num_samples,
    uuid4,
)



[docs]
@dataclass
class PaddingCut(Cut):
    """
    :class:`~lhotse.cut.PaddingCut` is a dummy :class:`~lhotse.cut.Cut` that doesn't refer to
    actual recordings or features --it simply returns zero samples in the time domain
    and a specified features value in the feature domain.
    Its main role is to be appended to other cuts to make them evenly sized.

    Please refer to the documentation of :class:`~lhotse.cut.Cut` to learn more about using cuts.

    See also:

        - :class:`lhotse.cut.Cut`
        - :class:`lhotse.cut.MonoCut`
        - :class:`lhotse.cut.MixedCut`
        - :class:`lhotse.cut.CutSet`
    """

    id: str
    duration: Seconds
    sampling_rate: int
    feat_value: float

    # For frequency domain
    num_frames: Optional[int] = None
    num_features: Optional[int] = None
    frame_shift: Optional[float] = None

    # For time domain
    num_samples: Optional[int] = None
    video: Optional[VideoInfo] = None

    # Dict for storing padding values for custom array attributes
    custom: Optional[dict] = None

    @property
    def start(self) -> Seconds:
        return 0

    @property
    def supervisions(self):
        return []

    @property
    def channel(self) -> int:
        return 0

    @property
    def has_features(self) -> bool:
        return self.num_frames is not None

    @property
    def has_recording(self) -> bool:
        return self.num_samples is not None

    @property
    def has_video(self) -> bool:
        return self.has_recording and self.video is not None

    @property
    def num_channels(self) -> int:
        return 1


[docs]
    def has(self, field: str) -> bool:
        if field == "recording":
            return self.has_recording
        elif field == "features":
            return self.has_features
        elif field == "video":
            return self.has_video
        else:
            return self.custom is not None and field in self.custom


    @property
    def recording_id(self) -> str:
        return "PAD"

    # noinspection PyUnusedLocal

[docs]
    def load_features(self, *args, **kwargs) -> Optional[np.ndarray]:
        if self.has_features:
            return (
                np.ones((self.num_frames, self.num_features), np.float32)
                * self.feat_value
            )
        return None


    # noinspection PyUnusedLocal

[docs]
    def load_audio(self, *args, **kwargs) -> Optional[np.ndarray]:
        if self.has_recording:
            return np.zeros(
                (1, compute_num_samples(self.duration, self.sampling_rate)), np.float32
            )
        return None



[docs]
    def load_video(
        self,
        with_audio: bool = True,
    ) -> Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]]:
        if self.has_video:
            audio = None
            if with_audio:
                audio = torch.zeros(
                    1,
                    compute_num_samples(self.duration, self.sampling_rate),
                    dtype=torch.float32,
                )
            return (
                torch.zeros(
                    self.video.num_frames,
                    3,
                    self.video.height,
                    self.video.width,
                    dtype=torch.uint8,
                ),
                audio,
            )
        return None


    # noinspection PyUnusedLocal

[docs]
    def truncate(
        self,
        *,
        offset: Seconds = 0.0,
        duration: Optional[Seconds] = None,
        keep_excessive_supervisions: bool = True,
        preserve_id: bool = False,
        **kwargs,
    ) -> "PaddingCut":
        new_duration = self.duration - offset if duration is None else duration
        assert new_duration > 0.0
        return fastcopy(
            self,
            id=self.id if preserve_id else str(uuid4()),
            duration=new_duration,
            feat_value=self.feat_value,
            num_frames=compute_num_frames(
                duration=new_duration,
                frame_shift=self.frame_shift,
                sampling_rate=self.sampling_rate,
            )
            if self.num_frames is not None
            else None,
            num_samples=compute_num_samples(
                duration=new_duration, sampling_rate=self.sampling_rate
            )
            if self.num_samples is not None
            else None,
        )


    # noinspection PyUnusedLocal

[docs]
    def extend_by(
        self,
        *,
        duration: Seconds,
        direction: str = "both",
        preserve_id: bool = False,
        pad_silence: bool = True,
    ) -> "PaddingCut":
        """
        Return a new PaddingCut with region extended by the specified duration.

        :param duration: The duration by which to extend the cut.
        :param direction: string, 'left', 'right' or 'both'. Determines whether the cut should
            be extended to the left, right or both sides. By default, the cut is extended by
            the specified duration on both sides.
        :param preserve_id: When ``True``, preserves the cut ID from before padding.
            Otherwise, generates a new random ID (default).
        :param pad_silence: See usage in :func:`lhotse.cut.MonoCut.extend_by`. It is ignored here.
        :return: an extended PaddingCut.
        """
        new_duration = self.duration + duration
        if direction == "both":
            new_duration += duration
        assert new_duration > 0.0
        return fastcopy(
            self,
            id=self.id if preserve_id else str(uuid4()),
            duration=new_duration,
            feat_value=self.feat_value,
            num_frames=compute_num_frames(
                duration=new_duration,
                frame_shift=self.frame_shift,
                sampling_rate=self.sampling_rate,
            )
            if self.num_frames is not None
            else None,
            num_samples=compute_num_samples(
                duration=new_duration, sampling_rate=self.sampling_rate
            )
            if self.num_samples is not None
            else None,
        )



[docs]
    def pad(
        self,
        duration: Seconds = None,
        num_frames: int = None,
        num_samples: int = None,
        pad_feat_value: float = LOG_EPSILON,
        direction: str = "right",
        preserve_id: bool = False,
        pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
    ) -> Cut:
        """
        Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.

        The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
        or a specific number of samples `num_samples`. The three arguments are mutually exclusive.

        :param duration: The cut's minimal duration after padding.
        :param num_frames: The cut's total number of frames after padding.
        :param num_samples: The cut's total number of samples after padding.
        :param pad_feat_value: A float value that's used for padding the features.
            By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
        :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
            the cut.
        :param preserve_id: When ``True``, preserves the cut ID from before padding.
            Otherwise, generates a new random ID (default).
        :param pad_value_dict: Optional dict that specifies what value should be used
            for padding arrays in custom attributes.
        :return: a padded MixedCut if duration is greater than this cut's duration, otherwise ``self``.
        """
        from .set import pad

        return pad(
            self,
            duration=duration,
            num_frames=num_frames,
            num_samples=num_samples,
            pad_feat_value=pad_feat_value,
            direction=direction,
            preserve_id=preserve_id,
            pad_value_dict=pad_value_dict,
        )



[docs]
    def resample(self, sampling_rate: int, affix_id: bool = False) -> "PaddingCut":
        """
        Return a new ``MonoCut`` that will lazily resample the audio while reading it.
        This operation will drop the feature manifest, if attached.
        It does not affect the supervision.

        :param sampling_rate: The new sampling rate.
        :param affix_id: Should we modify the ID (useful if both versions of the same
            cut are going to be present in a single manifest).
        :return: a modified copy of the current ``MonoCut``.
        """
        assert self.has_recording, "Cannot resample a MonoCut without Recording."
        return fastcopy(
            self,
            id=f"{self.id}_rs{sampling_rate}" if affix_id else self.id,
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(self.duration, sampling_rate),
            num_frames=None,
            num_features=None,
            frame_shift=None,
        )



[docs]
    def perturb_speed(self, factor: float, affix_id: bool = True) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of speed perturbation
        on ``duration`` and ``num_samples``.

        :param factor: The speed will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_sp{factor}".
        :return: a modified copy of the current ``PaddingCut``.
        """
        # Pre-conditions
        if self.has_features:
            logging.warning(
                "Attempting to perturb speed on a MonoCut that references pre-computed features. "
                "The feature manifest will be detached, as we do not support feature-domain "
                "speed perturbation."
            )
            new_num_frames = None
            new_num_features = None
            new_frame_shift = None
        else:
            new_num_frames = self.num_frames
            new_num_features = self.num_features
            new_frame_shift = self.frame_shift
        new_num_samples = perturb_num_samples(self.num_samples, factor)
        new_duration = new_num_samples / self.sampling_rate
        return fastcopy(
            self,
            id=f"{self.id}_sp{factor}" if affix_id else self.id,
            num_samples=new_num_samples,
            duration=new_duration,
            num_frames=new_num_frames,
            num_features=new_num_features,
            frame_shift=new_frame_shift,
        )



[docs]
    def perturb_tempo(self, factor: float, affix_id: bool = True) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of tempo perturbation
        on ``duration`` and ``num_samples``.

        Compared to speed perturbation, tempo preserves pitch.
        :param factor: The tempo will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_tp{factor}".
        :return: a modified copy of the current ``PaddingCut``.
        """
        # Pre-conditions
        if self.has_features:
            logging.warning(
                "Attempting to perturb tempo on a MonoCut that references pre-computed features. "
                "The feature manifest will be detached, as we do not support feature-domain "
                "tempo perturbation."
            )
            new_num_frames = None
            new_num_features = None
            new_frame_shift = None
        else:
            new_num_frames = self.num_frames
            new_num_features = self.num_features
            new_frame_shift = self.frame_shift
        new_num_samples = perturb_num_samples(self.num_samples, factor)
        new_duration = new_num_samples / self.sampling_rate
        return fastcopy(
            self,
            id=f"{self.id}_tp{factor}" if affix_id else self.id,
            num_samples=new_num_samples,
            duration=new_duration,
            num_frames=new_num_frames,
            num_features=new_num_features,
            frame_shift=new_frame_shift,
        )



[docs]
    def perturb_volume(self, factor: float, affix_id: bool = True) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of volume perturbation
        on amplitude of samples.

        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_vp{factor}".
        :return: a modified copy of the current ``PaddingCut``.
        """

        return fastcopy(self, id=f"{self.id}_vp{factor}" if affix_id else self.id)



[docs]
    def reverb_rir(
        self,
        rir_recording: Optional["Recording"] = None,
        normalize_output: bool = True,
        early_only: bool = False,
        affix_id: bool = True,
        rir_channels: List[int] = [0],
        room_rng_seed: Optional[int] = None,
        source_rng_seed: Optional[int] = None,
    ) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of reverberation with impulse response
        on original samples.

        :param rir_recording: The impulse response to use for convolving.
        :param normalize_output: When true, output will be normalized to have energy as input.
        :param early_only: When true, only the early reflections (first 50 ms) will be used.
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_rvb".
        :param rir_channels: The channels of the impulse response to use.
        :return: a modified copy of the current ``PaddingCut``.
        """

        return fastcopy(self, id=f"{self.id}_rvb" if affix_id else self.id)



[docs]
    def normalize_loudness(
        self, target: float, affix_id: bool = False, **kwargs
    ) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of loudness normalization

        :param target: The target loudness in dBFS.
        :param affix_id: When true, we will modify the ``DataCut.id`` field
            by affixing it with "_ln{target}".
        :return: a modified copy of the current ``DataCut``.
        """
        return fastcopy(
            self,
            id=f"{self.id}_ln{target}" if affix_id else self.id,
        )



[docs]
    def drop_features(self) -> "PaddingCut":
        """Return a copy of the current :class:`.PaddingCut`, detached from ``features``."""
        assert (
            self.has_recording
        ), f"Cannot detach features from a MonoCut with no Recording (cut ID = {self.id})."
        return fastcopy(self, num_frames=None, num_features=None, frame_shift=None)



[docs]
    def drop_recording(self) -> "PaddingCut":
        """Return a copy of the current :class:`.PaddingCut`, detached from ``recording``."""
        assert (
            self.has_features
        ), f"Cannot detach recording from a PaddingCut with no Features (cut ID = {self.id})."
        return fastcopy(self, num_samples=None)



[docs]
    def drop_supervisions(self) -> "PaddingCut":
        """Return a copy of the current :class:`.PaddingCut`, detached from ``supervisions``."""
        return self



[docs]
    def drop_alignments(self) -> "PaddingCut":
        """Return a copy of the current :class:`.PaddingCut`, detached from ``alignments``."""
        return self



[docs]
    def compute_and_store_features(
        self, extractor: FeatureExtractor, *args, **kwargs
    ) -> Cut:
        """
        Returns a new PaddingCut with updates information about the feature dimension and number of
        feature frames, depending on the ``extractor`` properties.
        """
        return fastcopy(
            self,
            num_features=extractor.feature_dim(self.sampling_rate),
            num_frames=compute_num_frames(
                duration=self.duration,
                frame_shift=extractor.frame_shift,
                sampling_rate=self.sampling_rate,
            ),
            frame_shift=extractor.frame_shift,
        )



[docs]
    def fill_supervision(self, *args, **kwargs) -> "PaddingCut":
        """
        Just for consistency with :class`.MonoCut` and :class:`.MixedCut`.
        """
        return self



[docs]
    def move_to_memory(self, *args, **kwargs) -> "PaddingCut":
        """
        Just for consistency with :class`.MonoCut` and :class:`.MixedCut`.
        """
        return self



[docs]
    def map_supervisions(self, transform_fn: Callable[[Any], Any]) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :param transform_fn: a dummy function that would be never called actually.
        :return: the PaddingCut itself.
        """
        return self



[docs]
    def merge_supervisions(self, *args, **kwargs) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :return: the PaddingCut itself.
        """
        return self



[docs]
    def filter_supervisions(
        self, predicate: Callable[[SupervisionSegment], bool]
    ) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :param predicate: A callable that accepts `SupervisionSegment` and returns bool
        :return: a modified MonoCut
        """
        return self



[docs]
    @staticmethod
    def from_dict(data: dict) -> "PaddingCut":
        # Remove "type" field if exists
        data.pop("type", None)
        return PaddingCut(**data)



[docs]
    def with_features_path_prefix(self, path: Pathlike) -> "PaddingCut":
        return self



[docs]
    def with_recording_path_prefix(self, path: Pathlike) -> "PaddingCut":
        return self