# Source code for lhotse.features.librosa_fbank

from dataclasses import asdict, dataclass
from typing import Any, Dict, Optional

import numpy as np

from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import (
    EPSILON,
    LOG_EPSILON,
    Seconds,
    compute_num_frames,
    is_module_available,
)


@dataclass
class LibrosaFbankConfig:
    """Default librosa config with values consistent with various TTS projects.

    This config is intended for use with popular TTS projects such as
    [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN).
    Warning: You may need to normalize your features.

    The field names mirror the keyword arguments of ``logmelfilterbank`` so
    that ``asdict(config)`` can be unpacked directly into that function.
    """

    # Sampling rate the extractor expects its input audio to have.
    sampling_rate: int = 22050
    # FFT size passed to the STFT and to the mel filterbank construction.
    fft_size: int = 1024
    # Hop size between consecutive STFT frames, in samples.
    hop_size: int = 256
    # Window length; ``None`` means "same as ``fft_size``".
    win_length: Optional[int] = None
    # Window function type.
    window: str = "hann"
    # Number of mel bins (the output feature dimension).
    num_mel_bins: int = 80
    # Minimum frequency used in the mel basis calculation.
    fmin: int = 80
    # Maximum frequency used in the mel basis calculation.
    fmax: int = 7600

    def to_dict(self) -> Dict[str, Any]:
        """Return the config as a plain ``{field name: value}`` dict."""
        return asdict(self)

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "LibrosaFbankConfig":
        """Build a config from a dict; keys missing from ``data`` keep their defaults."""
        return LibrosaFbankConfig(**data)


def pad_or_truncate_features(
    feats: np.ndarray,
    expected_num_frames: int,
    abs_tol: int = 1,
    pad_value: float = LOG_EPSILON,
) -> np.ndarray:
    """Make ``feats`` have exactly ``expected_num_frames`` rows.

    Small mismatches (at most ``abs_tol`` frames) are repaired by dropping
    trailing frames or by appending constant-valued frames; larger mismatches
    are treated as an error.

    Args:
        feats: Feature matrix with frames along axis 0. The padding width is
            given for exactly two axes, so a 2-D array is required whenever
            padding takes place.
        expected_num_frames: Number of frames the result must have.
        abs_tol: Largest absolute frame-count difference that is fixed silently.
        pad_value: Value used to fill the appended frames.

    Returns:
        ``feats`` unchanged when the frame count already matches, otherwise a
        truncated view or a padded copy with ``expected_num_frames`` rows.

    Raises:
        ValueError: If the frame count differs from ``expected_num_frames``
            by more than ``abs_tol``.
    """
    num_frames = feats.shape[0]
    frames_diff = num_frames - expected_num_frames

    if abs(frames_diff) > abs_tol:
        raise ValueError(
            f"Expected {expected_num_frames} frames; feats.shape[0] = {num_frames}"
        )

    if frames_diff > 0:
        # Too many frames: drop the surplus from the end.
        return feats[:expected_num_frames]

    if frames_diff < 0:
        # Too few frames: append rows filled with the caller-supplied value
        # (previously the argument was ignored in favour of LOG_EPSILON).
        return np.pad(
            feats,
            ((0, -frames_diff), (0, 0)),
            mode="constant",
            constant_values=pad_value,
        )

    return feats


def logmelfilterbank(
    audio: np.ndarray,
    sampling_rate: int,
    fft_size: int = 1024,
    hop_size: int = 256,
    win_length: Optional[int] = None,
    window: str = "hann",
    num_mel_bins: int = 80,
    fmin: Optional[int] = 80,
    fmax: Optional[int] = 7600,
    eps: float = EPSILON,
) -> np.ndarray:
    """Compute log-Mel filterbank feature.

    Args:
        audio (ndarray): Audio signal, either (T,) or single-channel (1, T).
        sampling_rate (int): Sampling rate.
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length. If set to None, it will be the same as fft_size.
        window (str): Window function type.
        num_mel_bins (int): Number of mel basis.
        fmin (int): Minimum frequency in mel basis calculation. None means 0.
        fmax (int): Maximum frequency in mel basis calculation.
            None means ``sampling_rate / 2``.
        eps (float): Epsilon value to avoid inf in log calculation.
    Returns:
        ndarray: Log Mel filterbank feature (#frames, num_mel_bins).
    Raises:
        ImportError: If librosa is not installed.
        AssertionError: If ``audio`` is not a single-channel signal.
        ValueError: If the number of computed frames differs from the expected
            number by more than the tolerance of ``pad_or_truncate_features``.
    """
    # librosa is an optional dependency, so it is imported lazily here.
    if not is_module_available("librosa"):
        raise ImportError(
            "Librosa is not installed. Please install librosa before using LibrosaFbank extractor."
        )
    import librosa

    # Accept (T,) directly, or (1, T) by dropping the channel axis.
    if audio.ndim == 2:
        assert (
            audio.shape[0] == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"
        audio = audio[0]
    else:
        assert (
            audio.ndim == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"

    x_stft = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window,
        pad_mode="reflect",
    )
    # Magnitude spectrogram, transposed so that frames run along axis 0.
    spc = np.abs(x_stft).T

    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(
        sr=sampling_rate, n_fft=fft_size, n_mels=num_mel_bins, fmin=fmin, fmax=fmax
    )

    # Floor at eps before the base-10 log so silent bins do not yield -inf.
    feats = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    # Align the STFT's frame count with the count derived from the audio
    # duration and the hop size.
    expected_num_frames = compute_num_frames(
        duration=len(audio) / sampling_rate,
        frame_shift=hop_size / sampling_rate,
        sampling_rate=sampling_rate,
    )
    return pad_or_truncate_features(feats, expected_num_frames)


@register_extractor
class LibrosaFbank(FeatureExtractor):
    """Librosa fbank feature extractor

    Differs from Fbank extractor in that it uses librosa backend for stft and mel scale calculations.
    It can be easily configured to be compatible with existing speech-related projects that use librosa features.

    The methods read ``self.config``, which is presumably populated by the
    ``FeatureExtractor`` base class with a ``LibrosaFbankConfig`` -- confirm
    against the base class.
    """

    name = "librosa-fbank"
    config_type = LibrosaFbankConfig

    @property
    def frame_shift(self) -> Seconds:
        """Time between consecutive frames: hop size divided by sampling rate."""
        return self.config.hop_size / self.config.sampling_rate

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the feature dimension, i.e. the configured number of mel bins.

        ``sampling_rate`` is accepted but not used.
        """
        return self.config.num_mel_bins

    def extract(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        """Compute log-Mel filterbank features for ``samples``.

        Args:
            samples: Audio passed on to ``logmelfilterbank``.
            sampling_rate: Must equal the sampling rate in the config.

        Returns:
            The feature matrix produced by ``logmelfilterbank``.

        Raises:
            AssertionError: If ``sampling_rate`` differs from the configured one.
        """
        assert sampling_rate == self.config.sampling_rate, (
            f"LibrosaFbank was configured for sampling_rate={self.config.sampling_rate}, "
            f"but received audio with sampling_rate={sampling_rate}"
        )
        # The config field names match logmelfilterbank's keyword arguments.
        return logmelfilterbank(samples, **asdict(self.config))

    # NOTE(review): the three helpers below use natural log/exp, whereas
    # logmelfilterbank emits base-10 log features -- confirm whether this
    # base mismatch is intended before relying on mixing or energy values.

    @staticmethod
    def mix(
        features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
    ) -> np.ndarray:
        """Mix two feature arrays, scaling the second by ``energy_scaling_factor_b``.

        Both inputs are exponentiated, summed (the second one after scaling)
        and the logarithm of the sum is returned.
        """
        return np.log(
            np.maximum(
                # protection against log(0); max with EPSILON is adequate since these are energies (always >= 0)
                EPSILON,
                np.exp(features_a) + energy_scaling_factor_b * np.exp(features_b),
            )
        )

    @staticmethod
    def compute_energy(features: np.ndarray) -> float:
        """Return the sum of the exponentiated features as a Python float."""
        return float(np.sum(np.exp(features)))

    @staticmethod
    def scale(features: np.ndarray, energy_scaling_factor: float) -> np.ndarray:
        """Scale features in the log domain by adding ``log(energy_scaling_factor)``."""
        return features + np.log(energy_scaling_factor)