Source code for lhotse.features.spectrogram

from dataclasses import asdict, dataclass
from typing import Any, Dict

import numpy as np

from lhotse.features.base import TorchaudioFeatureExtractor, register_extractor
from lhotse.utils import EPSILON, Seconds



[docs]
@dataclass
class TorchaudioSpectrogramConfig:
    """Configuration options for the torchaudio-based spectrogram extractor.

    The defaults below are the values used when a field is not provided.
    ``to_dict`` / ``from_dict`` give a plain-dict round trip of the fields.
    """

    # Note that `snip_edges` parameter is missing from config: in order to simplify the relationship between
    #  the duration and the number of frames, we are always setting `snip_edges` to False.
    dither: float = 0.0
    window_type: str = "povey"
    # Note that frame_length and frame_shift will be converted to milliseconds before torchaudio/Kaldi sees them
    frame_length: Seconds = 0.025
    frame_shift: Seconds = 0.01
    remove_dc_offset: bool = True
    round_to_power_of_two: bool = True
    energy_floor: float = EPSILON
    min_duration: float = 0.0
    preemphasis_coefficient: float = 0.97
    raw_energy: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Return the configuration fields as a plain ``dict`` (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TorchaudioSpectrogramConfig":
        """Build a configuration from a mapping of field names to values.

        Implemented as a classmethod so that a subclass calling ``from_dict``
        receives an instance of itself rather than of this base class.

        :param data: keyword arguments for the dataclass constructor; keys that
            are not fields of the class make the constructor raise ``TypeError``.
        :return: a new configuration instance.
        """
        return cls(**data)





[docs]
@register_extractor
class TorchaudioSpectrogram(TorchaudioFeatureExtractor):
    """Log spectrogram feature extractor based on ``torchaudio.compliance.kaldi.spectrogram`` function."""

    name = "spectrogram"
    config_type = TorchaudioSpectrogramConfig

    def _feature_fn(self, *args, **kwargs):
        """Call ``torchaudio.compliance.kaldi.spectrogram`` with the given arguments.

        The import happens at call time rather than at module import time.
        """
        from torchaudio.compliance.kaldi import spectrogram

        return spectrogram(*args, **kwargs)

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the number of spectrogram bins per frame.

        :param sampling_rate: sampling rate of the audio, in Hz.
        :return: ``N // 2 + 1``, where ``N`` is the analysis window length in
            samples, rounded up to the next power of two when
            ``config.round_to_power_of_two`` is set.
        """
        from torchaudio.compliance.kaldi import _next_power_of_2

        window_size = int(self.config.frame_length * sampling_rate)
        fft_size = (
            _next_power_of_2(window_size)
            if self.config.round_to_power_of_two
            else window_size
        )
        # The spectrogram is computed with a real FFT, which yields N // 2 + 1 bins
        # for an N-sample window regardless of whether the window was padded.
        return fft_size // 2 + 1

    @staticmethod
    def mix(
        features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
    ) -> np.ndarray:
        """Mix two log-power feature arrays in the linear (energy) domain.

        :param features_a: log-power features of the first signal.
        :param features_b: log-power features of the second signal.
        :param energy_scaling_factor_b: linear-domain gain applied to the energy
            of ``features_b`` before summation.
        :return: log of the summed energies, floored at ``log(EPSILON)``.
        """
        # Torchaudio returns log-power spectrum, hence the need for logsumexp
        return np.log(
            np.maximum(
                # protection against log(0); max with EPSILON is adequate since these are energies (always >= 0)
                EPSILON,
                np.exp(features_a) + energy_scaling_factor_b * np.exp(features_b),
            )
        )

    @staticmethod
    def compute_energy(features: np.ndarray) -> float:
        """Return the total energy of ``features`` as a Python float.

        :param features: log-power features.
        :return: sum of ``exp(features)`` over all elements.
        """
        # Torchaudio returns log-power spectrum, hence the need for exp before the sum
        return float(np.sum(np.exp(features)))