import logging
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from lhotse.audio import Recording
from lhotse.audio.utils import VideoInfo
from lhotse.cut.base import Cut
from lhotse.features import FeatureExtractor
from lhotse.supervision import SupervisionSegment
from lhotse.utils import (
LOG_EPSILON,
Pathlike,
Seconds,
compute_num_frames,
compute_num_samples,
fastcopy,
perturb_num_samples,
uuid4,
)
@dataclass
class PaddingCut(Cut):
    """
    :class:`~lhotse.cut.PaddingCut` is a dummy :class:`~lhotse.cut.Cut` that doesn't refer to
    actual recordings or features -- it simply returns zero samples in the time domain
    and a specified features value in the feature domain.
    Its main role is to be appended to other cuts to make them evenly sized.

    Please refer to the documentation of :class:`~lhotse.cut.Cut` to learn more about using cuts.

    See also:

        - :class:`lhotse.cut.Cut`
        - :class:`lhotse.cut.MonoCut`
        - :class:`lhotse.cut.MixedCut`
        - :class:`lhotse.cut.CutSet`
    """

    id: str
    duration: Seconds
    sampling_rate: int
    feat_value: float

    # For frequency domain
    num_frames: Optional[int] = None
    num_features: Optional[int] = None
    frame_shift: Optional[float] = None

    # For time domain
    num_samples: Optional[int] = None
    video: Optional[VideoInfo] = None

    # Dict for storing padding values for custom array attributes
    custom: Optional[dict] = None

    @property
    def start(self) -> Seconds:
        """Start offset of the padding; always zero."""
        return 0

    @property
    def supervisions(self):
        """Padding never carries supervisions; always an empty list."""
        return []

    @property
    def channel(self) -> int:
        """Channel index of the padding; always zero."""
        return 0

    @property
    def has_features(self) -> bool:
        """``True`` when ``num_frames`` is set."""
        return self.num_frames is not None

    @property
    def has_recording(self) -> bool:
        """``True`` when ``num_samples`` is set."""
        return self.num_samples is not None

    @property
    def has_video(self) -> bool:
        """``True`` when the cut has a recording and ``video`` info is set."""
        return self.has_recording and self.video is not None

    @property
    def num_channels(self) -> int:
        """Number of channels of the padding; always one."""
        return 1

    def has(self, field: str) -> bool:
        """
        Tell whether this cut provides the given kind of data.

        :param field: ``"recording"``, ``"features"``, ``"video"``, or the name of a key
            looked up in ``custom``.
        :return: ``True`` if the corresponding data is available.
        """
        if field == "recording":
            return self.has_recording
        if field == "features":
            return self.has_features
        if field == "video":
            return self.has_video
        return self.custom is not None and field in self.custom

    @property
    def recording_id(self) -> str:
        """Placeholder recording ID; always ``"PAD"``."""
        return "PAD"

    # noinspection PyUnusedLocal
    def load_features(self, *args, **kwargs) -> Optional[np.ndarray]:
        """
        Return a float32 array of shape ``(num_frames, num_features)`` filled with ``feat_value``,
        or ``None`` when the cut has no features. All arguments are ignored.
        """
        if not self.has_features:
            return None
        # ``np.full`` with an explicit dtype keeps the result float32 regardless of
        # the concrete scalar type of ``feat_value``.
        return np.full(
            (self.num_frames, self.num_features), self.feat_value, dtype=np.float32
        )

    # noinspection PyUnusedLocal
    def load_audio(self, *args, **kwargs) -> Optional[np.ndarray]:
        """
        Return a float32 array of zeros with shape ``(1, num_samples)``,
        or ``None`` when the cut has no recording. All arguments are ignored.

        The number of samples is derived from ``duration`` and ``sampling_rate``.
        """
        if not self.has_recording:
            return None
        return np.zeros(
            (1, compute_num_samples(self.duration, self.sampling_rate)), np.float32
        )

    def load_video(
        self,
        with_audio: bool = True,
    ) -> Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]]:
        """
        Return an all-zero video tensor and, optionally, an all-zero audio tensor.

        :param with_audio: When ``True``, the second element is a float32 zero tensor of shape
            ``(1, num_samples)``; otherwise it is ``None``.
        :return: ``None`` when the cut has no video, otherwise a tuple of a uint8 zero tensor of
            shape ``(video.num_frames, 3, video.height, video.width)`` and the audio (or ``None``).
        """
        if not self.has_video:
            return None
        audio = None
        if with_audio:
            audio = torch.zeros(
                1,
                compute_num_samples(self.duration, self.sampling_rate),
                dtype=torch.float32,
            )
        frames = torch.zeros(
            self.video.num_frames,
            3,
            self.video.height,
            self.video.width,
            dtype=torch.uint8,
        )
        return frames, audio

    def _with_duration(self, new_duration: Seconds, preserve_id: bool) -> "PaddingCut":
        """Copy this cut with a new duration, recomputing whichever of ``num_frames`` / ``num_samples`` is set."""
        assert new_duration > 0.0
        new_num_frames = None
        if self.num_frames is not None:
            new_num_frames = compute_num_frames(
                duration=new_duration,
                frame_shift=self.frame_shift,
                sampling_rate=self.sampling_rate,
            )
        new_num_samples = None
        if self.num_samples is not None:
            new_num_samples = compute_num_samples(
                duration=new_duration, sampling_rate=self.sampling_rate
            )
        return fastcopy(
            self,
            id=self.id if preserve_id else str(uuid4()),
            duration=new_duration,
            num_frames=new_num_frames,
            num_samples=new_num_samples,
        )

    def _perturb_length(
        self, factor: float, operation: str, id_suffix: str, affix_id: bool
    ) -> "PaddingCut":
        """Copy this cut with ``num_samples`` / ``duration`` rescaled by ``factor``, detaching features if present."""
        feature_overrides = {}
        if self.has_features:
            logging.warning(
                "Attempting to perturb %s on a PaddingCut that references pre-computed features. "
                "The feature manifest will be detached, as we do not support feature-domain "
                "%s perturbation.",
                operation,
                operation,
            )
            feature_overrides = dict(
                num_frames=None, num_features=None, frame_shift=None
            )
        # NOTE(review): ``num_samples`` is passed through as-is; this presumably
        # requires the cut to have a recording -- confirm against callers.
        new_num_samples = perturb_num_samples(self.num_samples, factor)
        return fastcopy(
            self,
            id=f"{self.id}{id_suffix}" if affix_id else self.id,
            num_samples=new_num_samples,
            duration=new_num_samples / self.sampling_rate,
            **feature_overrides,
        )

    # noinspection PyUnusedLocal
    def truncate(
        self,
        *,
        offset: Seconds = 0.0,
        duration: Optional[Seconds] = None,
        keep_excessive_supervisions: bool = True,
        preserve_id: bool = False,
        **kwargs,
    ) -> "PaddingCut":
        """
        Return a new PaddingCut with a shortened duration.

        :param offset: Subtracted from the current duration, but only when ``duration`` is ``None``.
        :param duration: The duration of the new cut; when ``None``, it is
            ``self.duration - offset``.
        :param keep_excessive_supervisions: Ignored here (a PaddingCut has no supervisions).
        :param preserve_id: When ``True``, preserves the cut ID.
            Otherwise, generates a new random ID (default).
        :param kwargs: Ignored.
        :return: a truncated PaddingCut.
        """
        new_duration = self.duration - offset if duration is None else duration
        return self._with_duration(new_duration, preserve_id=preserve_id)

    # noinspection PyUnusedLocal
    def extend_by(
        self,
        *,
        duration: Seconds,
        direction: str = "both",
        preserve_id: bool = False,
        pad_silence: bool = True,
    ) -> "PaddingCut":
        """
        Return a new PaddingCut with region extended by the specified duration.

        :param duration: The duration by which to extend the cut.
        :param direction: string, 'left', 'right' or 'both'. Determines whether the cut should
            be extended to the left, right or both sides. By default, the cut is extended by
            the specified duration on both sides.
        :param preserve_id: When ``True``, preserves the cut ID from before padding.
            Otherwise, generates a new random ID (default).
        :param pad_silence: See usage in :func:`lhotse.cut.MonoCut.extend_by`. It is ignored here.
        :return: an extended PaddingCut.
        """
        new_duration = self.duration + duration
        if direction == "both":
            # Extending on both sides adds ``duration`` twice.
            new_duration += duration
        return self._with_duration(new_duration, preserve_id=preserve_id)

    def pad(
        self,
        duration: Optional[Seconds] = None,
        num_frames: Optional[int] = None,
        num_samples: Optional[int] = None,
        pad_feat_value: float = LOG_EPSILON,
        direction: str = "right",
        preserve_id: bool = False,
        pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
    ) -> Cut:
        """
        Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.

        The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
        or a specific number of samples `num_samples`. The three arguments are mutually exclusive.

        :param duration: The cut's minimal duration after padding.
        :param num_frames: The cut's total number of frames after padding.
        :param num_samples: The cut's total number of samples after padding.
        :param pad_feat_value: A float value that's used for padding the features.
            By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
        :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
            the cut.
        :param preserve_id: When ``True``, preserves the cut ID from before padding.
            Otherwise, generates a new random ID (default).
        :param pad_value_dict: Optional dict that specifies what value should be used
            for padding arrays in custom attributes.
        :return: a padded MixedCut if duration is greater than this cut's duration, otherwise ``self``.
        """
        # Imported locally, as in the original code (presumably to avoid a circular import).
        from .set import pad

        return pad(
            self,
            duration=duration,
            num_frames=num_frames,
            num_samples=num_samples,
            pad_feat_value=pad_feat_value,
            direction=direction,
            preserve_id=preserve_id,
            pad_value_dict=pad_value_dict,
        )

    def resample(self, sampling_rate: int, affix_id: bool = False) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` with the given sampling rate and a matching ``num_samples``.

        The feature-related fields (``num_frames``, ``num_features``, ``frame_shift``)
        are reset to ``None`` in the returned cut.

        :param sampling_rate: The new sampling rate.
        :param affix_id: Should we modify the ID (useful if both versions of the same
            cut are going to be present in a single manifest).
        :return: a modified copy of the current ``PaddingCut``.
        """
        assert self.has_recording, "Cannot resample a PaddingCut without Recording."
        return fastcopy(
            self,
            id=f"{self.id}_rs{sampling_rate}" if affix_id else self.id,
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(self.duration, sampling_rate),
            num_frames=None,
            num_features=None,
            frame_shift=None,
        )

    def perturb_speed(self, factor: float, affix_id: bool = True) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of speed perturbation
        on ``duration`` and ``num_samples``.

        :param factor: The speed will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_sp{factor}".
        :return: a modified copy of the current ``PaddingCut``.
        """
        return self._perturb_length(
            factor, operation="speed", id_suffix=f"_sp{factor}", affix_id=affix_id
        )

    def perturb_tempo(self, factor: float, affix_id: bool = True) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of tempo perturbation
        on ``duration`` and ``num_samples``.
        Compared to speed perturbation, tempo preserves pitch.

        :param factor: The tempo will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_tp{factor}".
        :return: a modified copy of the current ``PaddingCut``.
        """
        return self._perturb_length(
            factor, operation="tempo", id_suffix=f"_tp{factor}", affix_id=affix_id
        )

    def perturb_volume(self, factor: float, affix_id: bool = True) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of volume perturbation
        on amplitude of samples. Only the ID may change.

        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_vp{factor}".
        :return: a modified copy of the current ``PaddingCut``.
        """
        return fastcopy(self, id=f"{self.id}_vp{factor}" if affix_id else self.id)

    def reverb_rir(
        self,
        rir_recording: Optional["Recording"] = None,
        normalize_output: bool = True,
        early_only: bool = False,
        affix_id: bool = True,
        # The list default is never mutated here; it is kept unchanged so the
        # signature stays identical for existing callers.
        rir_channels: List[int] = [0],
        room_rng_seed: Optional[int] = None,
        source_rng_seed: Optional[int] = None,
    ) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of reverberation with impulse response
        on original samples. Only the ID may change; every other argument is ignored here.

        :param rir_recording: The impulse response to use for convolving.
        :param normalize_output: When true, output will be normalized to have energy as input.
        :param early_only: When true, only the early reflections (first 50 ms) will be used.
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_rvb".
        :param rir_channels: The channels of the impulse response to use.
        :param room_rng_seed: Ignored here.
        :param source_rng_seed: Ignored here.
        :return: a modified copy of the current ``PaddingCut``.
        """
        return fastcopy(self, id=f"{self.id}_rvb" if affix_id else self.id)

    def normalize_loudness(
        self, target: float, affix_id: bool = False, **kwargs
    ) -> "PaddingCut":
        """
        Return a new ``PaddingCut`` that will "mimic" the effect of loudness normalization.
        Only the ID may change.

        :param target: The target loudness in dBFS.
        :param affix_id: When true, we will modify the ``PaddingCut.id`` field
            by affixing it with "_ln{target}".
        :param kwargs: Ignored.
        :return: a modified copy of the current ``PaddingCut``.
        """
        return fastcopy(
            self,
            id=f"{self.id}_ln{target}" if affix_id else self.id,
        )

    def drop_features(self) -> "PaddingCut":
        """Return a copy of the current :class:`.PaddingCut`, detached from ``features``."""
        assert (
            self.has_recording
        ), f"Cannot detach features from a PaddingCut with no Recording (cut ID = {self.id})."
        return fastcopy(self, num_frames=None, num_features=None, frame_shift=None)

    def drop_recording(self) -> "PaddingCut":
        """Return a copy of the current :class:`.PaddingCut`, detached from ``recording``."""
        assert (
            self.has_features
        ), f"Cannot detach recording from a PaddingCut with no Features (cut ID = {self.id})."
        return fastcopy(self, num_samples=None)

    def drop_supervisions(self) -> "PaddingCut":
        """Return the :class:`.PaddingCut` itself, as it has no ``supervisions`` to detach."""
        return self

    def drop_alignments(self) -> "PaddingCut":
        """Return the :class:`.PaddingCut` itself, as it has no ``alignments`` to detach."""
        return self

    def compute_and_store_features(
        self, extractor: FeatureExtractor, *args, **kwargs
    ) -> Cut:
        """
        Returns a new PaddingCut with updated information about the feature dimension and number of
        feature frames, depending on the ``extractor`` properties.

        :param extractor: Provides ``feature_dim(sampling_rate)`` and ``frame_shift``.
            Remaining arguments are ignored.
        :return: a copy of this cut with ``num_features``, ``num_frames`` and ``frame_shift`` set.
        """
        return fastcopy(
            self,
            num_features=extractor.feature_dim(self.sampling_rate),
            num_frames=compute_num_frames(
                duration=self.duration,
                frame_shift=extractor.frame_shift,
                sampling_rate=self.sampling_rate,
            ),
            frame_shift=extractor.frame_shift,
        )

    def fill_supervision(self, *args, **kwargs) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :return: the PaddingCut itself.
        """
        return self

    def move_to_memory(self, *args, **kwargs) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :return: the PaddingCut itself.
        """
        return self

    def map_supervisions(self, transform_fn: Callable[[Any], Any]) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :param transform_fn: a dummy function that would be never called actually.
        :return: the PaddingCut itself.
        """
        return self

    def merge_supervisions(self, *args, **kwargs) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :return: the PaddingCut itself.
        """
        return self

    def filter_supervisions(
        self, predicate: Callable[[SupervisionSegment], bool]
    ) -> "PaddingCut":
        """
        Just for consistency with :class:`.MonoCut` and :class:`.MixedCut`.

        :param predicate: A callable that accepts `SupervisionSegment` and returns bool;
            it is never called here.
        :return: the PaddingCut itself.
        """
        return self

    @staticmethod
    def from_dict(data: dict) -> "PaddingCut":
        """
        Build a PaddingCut from its dict representation.

        :param data: Mapping of field names to values; an optional ``"type"`` key is skipped.
            The input dict is left unmodified.
        :return: a new PaddingCut.
        """
        # Filter into a new dict instead of popping, so the caller's dict is not mutated.
        fields = {key: value for key, value in data.items() if key != "type"}
        return PaddingCut(**fields)

    def with_features_path_prefix(self, path: Pathlike) -> "PaddingCut":
        """Return the PaddingCut itself; ``path`` is ignored."""
        return self

    def with_recording_path_prefix(self, path: Pathlike) -> "PaddingCut":
        """Return the PaddingCut itself; ``path`` is ignored."""
        return self