from dataclasses import asdict, dataclass
from typing import Any, Dict, Optional

import numpy as np

from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import (
    EPSILON,
    LOG_EPSILON,
    Seconds,
    compute_num_frames,
    is_module_available,
)
@dataclass
class LibrosaFbankConfig:
    """Default librosa config with values consistent with various TTS projects.

    This config is intended for use with popular TTS projects such as
    [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)

    Warning: You may need to normalize your features.
    """

    sampling_rate: int = 22050
    fft_size: int = 1024
    # Hop between consecutive frames; the extractor's frame shift is
    # ``hop_size / sampling_rate``.
    hop_size: int = 256
    # ``None`` means "not set"; ``logmelfilterbank`` documents that the window
    # length then equals ``fft_size``.
    win_length: Optional[int] = None
    window: str = "hann"
    num_mel_bins: int = 80
    fmin: int = 80
    fmax: int = 7600

    def to_dict(self) -> Dict[str, Any]:
        """Return the configuration as a plain ``{field name: value}`` dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LibrosaFbankConfig":
        """Build a config from a dict of field values.

        Keys missing from ``data`` keep their defaults; unknown keys raise
        ``TypeError`` from the generated ``__init__``. Being a classmethod,
        calling it on a subclass yields an instance of that subclass.
        """
        return cls(**data)
def pad_or_truncate_features(
    feats: np.ndarray,
    expected_num_frames: int,
    abs_tol: int = 1,
    pad_value: float = LOG_EPSILON,
) -> np.ndarray:
    """Make ``feats`` have exactly ``expected_num_frames`` rows.

    A mismatch of at most ``abs_tol`` frames is repaired: surplus trailing
    frames are cut off, missing trailing frames are filled with ``pad_value``.
    A larger mismatch is treated as an error.

    Args:
        feats (ndarray): Feature matrix with frames along axis 0. The padding
            spec is written for a 2-D array.
        expected_num_frames (int): Required number of frames.
        abs_tol (int): Largest frame-count difference that is silently fixed.
        pad_value (float): Value written into every cell of appended frames.

    Returns:
        ndarray: ``feats`` itself when the length already matches, a slice of
        it when truncating, or a new padded array.

    Raises:
        ValueError: If the frame counts differ by more than ``abs_tol``.
    """
    frames_diff = feats.shape[0] - expected_num_frames
    if abs(frames_diff) > abs_tol:
        raise ValueError(
            f"Expected {expected_num_frames} frames (tolerance: {abs_tol}); "
            f"feats.shape[0] = {feats.shape[0]}"
        )
    if frames_diff > 0:
        # Too long: drop the surplus frames from the end.
        return feats[:expected_num_frames]
    if frames_diff < 0:
        # Too short: append frames at the end only, leaving the feature axis
        # untouched. Honour the caller-supplied pad_value instead of a fixed
        # constant.
        return np.pad(
            feats,
            ((0, -frames_diff), (0, 0)),
            mode="constant",
            constant_values=pad_value,
        )
    return feats
def logmelfilterbank(
    audio: np.ndarray,
    sampling_rate: int,
    fft_size: int = 1024,
    hop_size: int = 256,
    win_length: Optional[int] = None,
    window: str = "hann",
    num_mel_bins: int = 80,
    fmin: Optional[int] = 80,
    fmax: Optional[int] = 7600,
    eps: float = EPSILON,
) -> np.ndarray:
    """Compute log-Mel filterbank feature.

    The magnitude STFT is projected onto a mel basis, floored at ``eps`` and
    passed through ``log10``. The result is then aligned to the frame count
    that ``compute_num_frames`` reports for the signal duration.

    Args:
        audio (ndarray): Audio signal (T,) or (1, T).
        sampling_rate (int): Sampling rate.
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length. If set to None, it will be the same as fft_size.
        window (str): Window function type.
        num_mel_bins (int): Number of mel basis.
        fmin (int): Minimum frequency in mel basis calculation. None means 0.
        fmax (int): Maximum frequency in mel basis calculation. None means
            sampling_rate / 2.
        eps (float): Epsilon value to avoid inf in log calculation.

    Returns:
        ndarray: Log Mel filterbank feature (#frames, num_mel_bins).

    Raises:
        ImportError: If librosa is not installed.
        AssertionError: If ``audio`` is not a single-channel signal.
        ValueError: If the computed frame count is too far from the expected one
            (raised by ``pad_or_truncate_features``).
    """
    if not is_module_available("librosa"):
        raise ImportError(
            "Librosa is not installed. Please install librosa before using LibrosaFbank extractor."
        )
    import librosa

    # Accept (1, T) by squeezing the channel axis; anything else must be (T,).
    if audio.ndim == 2:
        assert (
            audio.shape[0] == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"
        audio = audio[0]
    else:
        assert (
            audio.ndim == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"

    x_stft = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window,
        pad_mode="reflect",
    )
    # Magnitude spectrogram, transposed so that frames run along axis 0.
    spc = np.abs(x_stft).T

    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(
        sr=sampling_rate, n_fft=fft_size, n_mels=num_mel_bins, fmin=fmin, fmax=fmax
    )
    feats = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    expected_num_frames = compute_num_frames(
        duration=len(audio) / sampling_rate,
        frame_shift=hop_size / sampling_rate,
        sampling_rate=sampling_rate,
    )
    # Request padding at the same floor the log10 above produces. The helper's
    # default pad value is a fixed constant that does not follow ``eps``.
    return pad_or_truncate_features(
        feats, expected_num_frames, pad_value=float(np.log10(eps))
    )
@register_extractor
class LibrosaFbank(FeatureExtractor):
    """Librosa fbank feature extractor

    Differs from Fbank extractor in that it uses librosa backend for stft and mel scale calculations.
    It can be easily configured to be compatible with existing speech-related projects that use librosa features.

    ``mix`` and ``compute_energy`` treat features as base-10 logarithms, which
    is what ``logmelfilterbank`` in this module produces.
    NOTE(review): assumes the features handled by this class come from
    ``logmelfilterbank`` -- confirm against the extraction entry point.
    """

    name = "librosa-fbank"
    config_type = LibrosaFbankConfig

    @property
    def frame_shift(self) -> Seconds:
        """Time between consecutive frames: hop size divided by sampling rate."""
        return self.config.hop_size / self.config.sampling_rate

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the number of mel bins; ``sampling_rate`` is not used."""
        return self.config.num_mel_bins

    @staticmethod
    def mix(
        features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
    ) -> np.ndarray:
        """Mix two log10 feature arrays in the linear domain.

        Both inputs are mapped back with ``10 ** x``, ``features_b`` is scaled
        by ``energy_scaling_factor_b``, the two are summed, and the sum is
        returned as ``log10`` again.
        """
        linear_sum = np.power(10.0, features_a) + energy_scaling_factor_b * np.power(
            10.0, features_b
        )
        # protection against log(0); max with EPSILON is adequate since these are energies (always >= 0)
        return np.log10(np.maximum(EPSILON, linear_sum))

    @staticmethod
    def compute_energy(features: np.ndarray) -> float:
        """Return the sum of the linear-domain (``10 ** x``) feature values."""
        return float(np.sum(np.power(10.0, features)))