from dataclasses import asdict, dataclass
from typing import Any, Dict
from lhotse.features.base import TorchaudioFeatureExtractor, register_extractor
from lhotse.utils import EPSILON, Seconds
[docs]@dataclass
class MfccConfig:
# Spectogram-related part
dither: float = 0.0
window_type: str = "povey"
# Note that frame_length and frame_shift will be converted to milliseconds before torchaudio/Kaldi sees them
frame_length: Seconds = 0.025
frame_shift: Seconds = 0.01
remove_dc_offset: bool = True
round_to_power_of_two: bool = True
energy_floor: float = EPSILON
min_duration: float = 0.0
preemphasis_coefficient: float = 0.97
raw_energy: bool = True
# MFCC-related part
low_freq: float = 20.0
high_freq: float = -400.0
num_mel_bins: int = 23
use_energy: bool = False
vtln_low: float = 100.0
vtln_high: float = -500.0
vtln_warp: float = 1.0
cepstral_lifter: float = 22.0
num_ceps: int = 13
[docs] def to_dict(self) -> Dict[str, Any]:
return asdict(self)
[docs] @staticmethod
def from_dict(data: Dict[str, Any]) -> "MfccConfig":
return MfccConfig(**data)
[docs]@register_extractor
class Mfcc(TorchaudioFeatureExtractor):
"""MFCC feature extractor based on ``torchaudio.compliance.kaldi.mfcc`` function."""
name = "mfcc"
config_type = MfccConfig
def _feature_fn(self, *args, **kwargs):
from torchaudio.compliance.kaldi import mfcc
return mfcc(*args, **kwargs)
[docs] def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_ceps