# Source code for dcase_util.features.features

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, absolute_import

import numpy
import librosa
import scipy
import logging
import importlib
from dcase_util.containers import ContainerMixin
from dcase_util.ui import FancyStringifier, FancyHTMLStringifier
from dcase_util.utils import setup_logging, get_class_inheritors, is_jupyter


def feature_extractor_list(display=True):
    """List of feature extractors available

    Collects all classes inheriting from FeatureExtractor (classes whose name ends with
    'Processor' are skipped), instantiates each of them with default parameters to read
    its label and description, and formats the result as a table.

    Parameters
    ----------
    display : bool
        Display list immediately, otherwise return string
        Default value True

    Returns
    -------
    str or None
        Multi line string containing extractor table when display is False,
        None when the table was displayed.

    """

    extractor_classes = sorted(
        (
            item for item in get_class_inheritors(FeatureExtractor)
            if not item.__name__.endswith('Processor')
        ),
        key=lambda item: item.__name__
    )

    class_names = []
    labels = []
    descriptions = []
    for extractor_class in extractor_classes:
        # Instance is created so that label and description are read the same way as before
        extractor = extractor_class()
        class_names.append(extractor_class.__name__)
        labels.append(extractor.label)
        descriptions.append(extractor.description)

    # Environment check is done once and reused for both formatting and output
    in_jupyter = is_jupyter()
    ui = FancyHTMLStringifier() if in_jupyter else FancyStringifier()

    output = ui.table(
        cell_data=[class_names, labels, descriptions],
        column_headers=['Class name', 'Feature label', 'Description'],
        column_types=['str30', 'str20', 'str50'],
        column_separators=[0, 1]
    )

    if not display:
        return output

    if in_jupyter:
        # Imported under an alias so that the `display` argument is not rebound
        from IPython.display import display as ipython_display, HTML
        ipython_display(HTML(output))

    else:
        print(output)

    return None


def feature_extractor_factory(feature_extractor_label, **kwargs):
    """Function to get correct feature extractor class instance based on extractor label or class name.

    Classes inheriting from FeatureExtractor are searched in order. A class matches when its
    class name equals the given string, or when its ``label`` attribute equals the given
    string and its class name ends with 'Extractor'. The first match is used.

    Parameters
    ----------
    feature_extractor_label : str
        Class name or extractor label

    **kwargs
        Keyword arguments passed to the constructor of the matched class.

    Raises
    ------
    AttributeError
        No matching feature extractor class exists

    Returns
    -------
    Feature extractor class instance

    """

    try:
        feature_extractor_class = None

        # Search correct feature extractor among all classes inherited from FeatureExtractor
        for item in get_class_inheritors(FeatureExtractor):
            if str(item.__name__) == feature_extractor_label or (
                hasattr(item, 'label')
                and item.label == feature_extractor_label
                and item.__name__.endswith('Extractor')
            ):
                feature_extractor_class = getattr(
                    importlib.import_module(str(item.__module__)),
                    item.__name__
                )
                break

        # Valid feature extractor class not found, raise error
        if feature_extractor_class is None:
            raise AttributeError

    except AttributeError:
        message = 'Invalid FeatureExtractor class name or extractor label given [{label}].'.format(
            label=feature_extractor_label
        )
        logger = logging.getLogger(__name__)
        if not logger.handlers:
            setup_logging()

        logger.exception(message)
        raise AttributeError(message)

    return feature_extractor_class(**kwargs)


class FeatureExtractor(ContainerMixin):
    """Feature extractor base class

    Stores the sampling rate and the frame blocking parameters (window and hop length,
    both in samples and in seconds). Calling an instance is the same as calling
    its ``extract`` method.

    """
    label = 'extractor_base'  #: Extractor label
    description = 'Feature extractor base class'  #: Extractor description

    def __init__(self, fs=44100,
                 win_length_samples=None, hop_length_samples=None,
                 win_length_seconds=0.04, hop_length_seconds=0.02, **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal
            Default value 44100

        win_length_samples : int
            Window length in samples, if not set it is derived from win_length_seconds and fs
            Default value None

        hop_length_samples : int
            Hop length in samples, if not set it is derived from hop_length_seconds and fs
            Default value None

        win_length_seconds : float
            Window length in seconds
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds
            Default value 0.02

        Raises
        ------
        ValueError
            fs is None, or window / hop length in samples is not given and cannot be derived

        """

        # Run ContainerMixin init
        ContainerMixin.__init__(self, **kwargs)

        self.eps = numpy.spacing(1)

        if fs is None:
            message = '{name}: No fs set'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ValueError(message)

        self.fs = fs

        self.win_length_samples = win_length_samples
        self.hop_length_samples = hop_length_samples

        self.win_length_seconds = win_length_seconds
        self.hop_length_seconds = hop_length_seconds

        # Lengths given in samples take precedence, lengths in seconds are used as fallback
        if not self.win_length_samples and self.win_length_seconds and self.fs:
            self.win_length_samples = int(self.fs * self.win_length_seconds)

        if not self.hop_length_samples and self.hop_length_seconds and self.fs:
            self.hop_length_samples = int(self.fs * self.hop_length_seconds)

        if self.win_length_samples is None:
            message = '{name}: No win_length_samples set'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ValueError(message)

        if self.hop_length_samples is None:
            message = '{name}: No hop_length_samples set'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ValueError(message)

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = ''
        output += ui.class_name(self.__class__.__name__, indent=indent) + '\n'

        if hasattr(self, 'filename') and self.filename:
            # Use the given stringifier so the filename row matches the rest of the output
            output += ui.data(field='filename', value=self.filename, indent=indent) + '\n'

        output += ui.data(field='fs', value=self.fs, indent=indent) + '\n'
        output += ui.line(field='Frame blocking', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='hop_length_samples', value=self.hop_length_samples) + '\n'
        output += ui.data(indent=indent + 2, field='hop_length_seconds', value=self.hop_length_seconds, unit='sec') + '\n'

        output += ui.data(indent=indent + 2, field='win_length_samples', value=self.win_length_samples) + '\n'
        output += ui.data(indent=indent + 2, field='win_length_seconds', value=self.win_length_seconds, unit='sec') + '\n'

        return output

    def __getstate__(self):
        # Return only needed data for pickle
        return {
            'eps': self.eps,
            'fs': self.fs,
            'win_length_samples': self.win_length_samples,
            'hop_length_samples': self.hop_length_samples,
            'win_length_seconds': self.win_length_seconds,
            'hop_length_seconds': self.hop_length_seconds
        }

    def __setstate__(self, d):
        # Restore the fields stored by __getstate__
        self.eps = d['eps']
        self.fs = d['fs']
        self.win_length_samples = d['win_length_samples']
        self.hop_length_samples = d['hop_length_samples']
        self.win_length_seconds = d['win_length_seconds']
        self.hop_length_seconds = d['hop_length_seconds']

    def __call__(self, *args, **kwargs):
        # Calling the extractor instance delegates to extract()
        return self.extract(*args, **kwargs)

    def extract(self, y):
        """Extract features for the audio signal (PLACEHOLDER).

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        None

        """

        pass


class SpectralFeatureExtractor(FeatureExtractor):
    """Spectral feature extractor base class

    Adds spectrogram related parameters (spectrogram type, FFT length, window function)
    on top of the frame blocking parameters handled by FeatureExtractor.

    """
    label = 'spectrogram'  #: Extractor label
    description = 'Spectral feature extractor base class (Librosa)'  #: Extractor description

    def __init__(self, spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric', **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.

        win_length_samples : int
            Window length in samples.

        hop_length_samples : int
            Hop length in samples.

        win_length_seconds : float
            Window length in seconds.

        hop_length_seconds : float
            Hop length in seconds.

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        """

        # Run FeatureExtractor init (once)
        super(SpectralFeatureExtractor, self).__init__(**kwargs)

        self.spectrogram_type = spectrogram_type
        self.n_fft = n_fft
        self.window_type = window_type

        self.window = self.get_window_function(
            n=self.win_length_samples,
            window_type=self.window_type
        )

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(SpectralFeatureExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Spectrogram', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='spectrogram_type', value=self.spectrogram_type) + '\n'
        output += ui.data(indent=indent + 2, field='n_fft', value=self.n_fft) + '\n'
        output += ui.data(indent=indent + 2, field='window_type', value=self.window_type) + '\n'

        return output

    def __getstate__(self):
        # Extend the base state with the spectrogram parameters. The window itself is not
        # stored, it is re-created from window_type in __setstate__.
        d = super(SpectralFeatureExtractor, self).__getstate__()
        d.update({
            'spectrogram_type': self.spectrogram_type,
            'n_fft': self.n_fft,
            'window_type': self.window_type,
        })

        return d

    def __setstate__(self, d):
        super(SpectralFeatureExtractor, self).__setstate__(d)

        self.spectrogram_type = d['spectrogram_type']
        self.n_fft = d['n_fft']
        self.window_type = d['window_type']

        self.window = self.get_window_function(
            n=self.win_length_samples,
            window_type=self.window_type
        )

    def get_window_function(self, n, window_type='hamming_asymmetric'):
        """Window function

        Parameters
        ----------
        n : int
            window length

        window_type : str
            window type, one of 'hamming_asymmetric', 'hamming_symmetric', 'hamming',
            'hann_asymmetric', 'hann_symmetric', 'hann'
            Default value 'hamming_asymmetric'

        Raises
        ------
        ValueError:
            Unknown window type

        Returns
        -------
        numpy.array
            window function

        """

        # Imported here so that scipy.signal is guaranteed to be loaded
        from scipy.signal import get_window

        # Supported window types mapped to (scipy window name, symmetric flag)
        window_types = {
            'hamming_asymmetric': ('hamming', False),
            'hamming_symmetric': ('hamming', True),
            'hamming': ('hamming', True),
            'hann_asymmetric': ('hann', False),
            'hann_symmetric': ('hann', True),
            'hann': ('hann', True),
        }

        if window_type not in window_types:
            message = '{name}: Unknown window type [{window_type}]'.format(
                name=self.__class__.__name__,
                window_type=window_type
            )

            self.logger.exception(message)
            raise ValueError(message)

        window_name, symmetric = window_types[window_type]

        # fftbins=False gives the symmetric window, fftbins=True the periodic (asymmetric) one
        return get_window(window_name, n, fftbins=not symmetric)

    def get_spectrogram(self, y, n_fft=None, win_length_samples=None, hop_length_samples=None,
                        window=None, center=True, spectrogram_type=None):
        """Spectrogram

        Parameters left as None fall back to the values stored in the extractor instance.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray
            Audio data, an AudioContainer must contain exactly one channel

        n_fft : int
            FFT size
            Default value None

        win_length_samples : int
            Window length in samples
            Default value None

        hop_length_samples : int
            Hop length in samples
            Default value None

        window : numpy.array
            Window function
            Default value None

        center : bool
            If true, input signal is padded so to the frame is centered at hop length
            Default value True

        spectrogram_type : str
            Type of spectrogram "magnitude" or "power"
            Default value None

        Raises
        ------
        ValueError:
            Input has more than one audio channel, or unknown spectrogram type

        Returns
        -------
        numpy.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
            Magnitude or power spectrogram

        """

        if n_fft is None:
            n_fft = self.n_fft

        if win_length_samples is None:
            win_length_samples = self.win_length_samples

        if hop_length_samples is None:
            hop_length_samples = self.hop_length_samples

        if window is None:
            window = self.window

        if spectrogram_type is None:
            spectrogram_type = self.spectrogram_type

        from dcase_util.containers import AudioContainer

        if isinstance(y, AudioContainer):
            if y.channels != 1:
                message = '{name}: Input has more than one audio channel.'.format(
                    name=self.__class__.__name__
                )

                self.logger.exception(message)
                raise ValueError(message)

            y = y.data

        if spectrogram_type not in ('magnitude', 'power'):
            message = '{name}: Unknown spectrum type [{spectrogram_type}]'.format(
                name=self.__class__.__name__,
                spectrogram_type=spectrogram_type
            )

            self.logger.exception(message)
            raise ValueError(message)

        # eps is added to the signal before the transform, as in both spectrogram types
        magnitude = numpy.abs(
            librosa.stft(
                y + self.eps,
                n_fft=n_fft,
                win_length=win_length_samples,
                hop_length=hop_length_samples,
                center=center,
                window=window
            )
        )

        if spectrogram_type == 'power':
            return magnitude ** 2

        return magnitude

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1 + n_fft/2, t)]
            spectrum
        """

        return self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=True,
            window=self.window
        )


class MelExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract mel band energy features"""
    label = 'mel'  #: Extractor label
    description = 'Mel band energy (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False, logarithmic=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz)
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney
            Default value False

        logarithmic : bool
            Switch for log mel-band energies
            Default value True

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # Run SpectralFeatureExtractor init (once)
        super(MelExtractor, self).__init__(**kwargs)

        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.normalize_mel_bands = normalize_mel_bands
        self.htk = htk
        self.logarithmic = logarithmic

        self.mel_basis = self._create_mel_basis()

    def _create_mel_basis(self):
        """Create mel filterbank matrix based on the current extractor parameters."""

        mel_basis = librosa.filters.mel(
            sr=self.fs,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            htk=self.htk
        )

        if self.normalize_mel_bands:
            # Scale each band (row) by its own maximum
            mel_basis /= numpy.max(mel_basis, axis=-1)[:, None]

        return mel_basis

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MelExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Mel', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='n_mels', value=self.n_mels) + '\n'
        output += ui.data(indent=indent + 2, field='fmin', value=self.fmin) + '\n'
        output += ui.data(indent=indent + 2, field='fmax', value=self.fmax if self.fmax is not None else 'None') + '\n'
        output += ui.data(indent=indent + 2, field='normalize_mel_bands', value=self.normalize_mel_bands) + '\n'
        output += ui.data(indent=indent + 2, field='htk', value=self.htk) + '\n'
        output += ui.data(indent=indent + 2, field='logarithmic', value=self.logarithmic) + '\n'

        return output

    def __getstate__(self):
        # Extend parent state with mel parameters, mel_basis is re-created in __setstate__
        d = super(MelExtractor, self).__getstate__()
        d.update({
            'n_mels': self.n_mels,
            'fmin': self.fmin,
            'fmax': self.fmax,
            'normalize_mel_bands': self.normalize_mel_bands,
            'htk': self.htk,
            'logarithmic': self.logarithmic,
        })

        return d

    def __setstate__(self, d):
        super(MelExtractor, self).__setstate__(d)

        self.n_mels = d['n_mels']
        self.fmin = d['fmin']
        self.fmax = d['fmax']
        self.normalize_mel_bands = d['normalize_mel_bands']
        self.htk = d['htk']
        self.logarithmic = d['logarithmic']

        self.mel_basis = self._create_mel_basis()

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(n_mels, t)]
            mel band energies, in log scale when logarithmic is set
        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=True,
            window=self.window
        )
        mel_spectrum = numpy.dot(self.mel_basis, spectrogram)

        if self.logarithmic:
            # eps keeps the logarithm finite for zero-energy bands
            mel_spectrum = numpy.log(mel_spectrum + self.eps)

        return mel_spectrum


class MfccStaticExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract static MFCC features"""
    label = 'mfcc'  #: Extractor label
    description = 'MFCC (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False,
                 n_mfcc=20, omit_zeroth=False,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate.
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz).
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney
            Default value False

        n_mfcc : int
            Number of MFCC coefficients
            Default value 20

        omit_zeroth : bool
            Omit 0th coefficient
            Default value False

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # Run SpectralFeatureExtractor init (once)
        super(MfccStaticExtractor, self).__init__(**kwargs)

        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.normalize_mel_bands = normalize_mel_bands
        self.htk = htk
        self.n_mfcc = n_mfcc
        self.omit_zeroth = omit_zeroth

        self.mel_basis = self._create_mel_basis()

    def _create_mel_basis(self):
        """Create mel filterbank matrix based on the current extractor parameters."""

        mel_basis = librosa.filters.mel(
            sr=self.fs,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            htk=self.htk
        )

        if self.normalize_mel_bands:
            # Scale each band (row) by its own maximum
            mel_basis /= numpy.max(mel_basis, axis=-1)[:, None]

        return mel_basis

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MfccStaticExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='MFCC', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='n_mels', value=self.n_mels) + '\n'
        output += ui.data(indent=indent + 2, field='fmin', value=self.fmin) + '\n'
        # fmax is shown the same way as in MelExtractor when it is not set
        output += ui.data(indent=indent + 2, field='fmax', value=self.fmax if self.fmax is not None else 'None') + '\n'
        output += ui.data(indent=indent + 2, field='normalize_mel_bands', value=self.normalize_mel_bands) + '\n'
        output += ui.data(indent=indent + 2, field='htk', value=self.htk) + '\n'
        output += ui.data(indent=indent + 2, field='n_mfcc', value=self.n_mfcc) + '\n'
        output += ui.data(indent=indent + 2, field='omit_zeroth', value=self.omit_zeroth) + '\n'

        return output

    def __getstate__(self):
        # Extend parent state with MFCC parameters, mel_basis is re-created in __setstate__
        d = super(MfccStaticExtractor, self).__getstate__()
        d.update({
            'n_mels': self.n_mels,
            'fmin': self.fmin,
            'fmax': self.fmax,
            'normalize_mel_bands': self.normalize_mel_bands,
            'htk': self.htk,
            'n_mfcc': self.n_mfcc,
            'omit_zeroth': self.omit_zeroth,
        })

        return d

    def __setstate__(self, d):
        super(MfccStaticExtractor, self).__setstate__(d)

        self.n_mels = d['n_mels']
        self.fmin = d['fmin']
        self.fmax = d['fmax']
        self.normalize_mel_bands = d['normalize_mel_bands']
        self.htk = d['htk']
        self.n_mfcc = d['n_mfcc']
        self.omit_zeroth = d['omit_zeroth']

        self.mel_basis = self._create_mel_basis()

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(n_mfcc, t)]
            mfccs, the first coefficient row is removed when omit_zeroth is set

        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=True,
            window=self.window
        )

        mel_spectrum = numpy.dot(self.mel_basis, spectrogram)
        mfccs = librosa.feature.mfcc(
            S=librosa.power_to_db(mel_spectrum),
            n_mfcc=self.n_mfcc
        )

        if self.omit_zeroth:
            # Remove first coefficient
            mfccs = mfccs[1:, :]

        return mfccs


[docs]class MfccDeltaExtractor(MfccStaticExtractor):
    """Feature extractor class to extract MFCC delta features"""
    label = 'mfcc_delta'  #: Extractor label
    description = 'MFCC delta (Librosa)'  #: Extractor description

[docs]    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False,
                 n_mfcc=20, omit_zeroth=False,
                 width=9,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate.
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz).
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0.
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0.
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney.
            Default value False

        n_mfcc : int
            Number of MFCC coefficients.
            Default value 20

        omit_zeroth : bool
            Omit 0th coefficient.
            Default value False

        width : int
            Width of the delta window.
            Default value 9

        """

        # Inject parameters for the parent classes back to kwargs. For the convenience they are expose explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type,
            'n_mels': n_mels,
            'fmin': fmin,
            'fmax': fmax,
            'normalize_mel_bands': normalize_mel_bands,
            'htk': htk,
            'n_mfcc': n_mfcc,
            'omit_zeroth': omit_zeroth
        })

        super(MfccStaticExtractor, self).__init__(**kwargs)

        # Run MfccStaticExtractor init
        MfccStaticExtractor.__init__(self, **kwargs)

        self.width = width

    def to_string(self, ui=None, indent=0):
        """Render the extractor parameters as a printable string

        The description produced by the parent class is extended with a
        'Delta' section showing the delta window width.

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        stringifier = FancyStringifier() if ui is None else ui

        # Parent description first, then the delta specific section below it.
        base_text = super(MfccDeltaExtractor, self).to_string(ui=stringifier, indent=indent)
        section_title = stringifier.line(field='Delta', indent=indent) + '\n'
        width_row = stringifier.data(indent=indent + 2, field='width', value=self.width) + '\n'

        return base_text + section_title + width_row

[docs]    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            MFCC delta

        """

        mfccs = super(MfccDeltaExtractor, self).extract(y=y)
        return librosa.feature.delta(mfccs, width=self.width, order=1, axis=-1)


class MfccAccelerationExtractor(MfccStaticExtractor):
    """Feature extractor class to extract MFCC acceleration features

    Static MFCCs are computed by the parent class and the second-order delta
    is then taken with `librosa.feature.delta`.

    """
    label = 'mfcc_acceleration'  #: Extractor label
    description = 'MFCC acceleration (Librosa)'  #: Extractor description

    def __init__(self, fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False,
                 n_mfcc=20, omit_zeroth=False,
                 width=9,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate.
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz).
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0.
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0.
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney.
            Default value False

        n_mfcc : int
            Number of MFCC coefficients.
            Default value 20

        omit_zeroth : bool
            Omit 0th coefficient.
            Default value False

        width : int
            Width of the delta window.
            Default value 9

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type,
            'n_mels': n_mels,
            'fmin': fmin,
            'fmax': fmax,
            'normalize_mel_bands': normalize_mel_bands,
            'htk': htk,
            'n_mfcc': n_mfcc,
            'omit_zeroth': omit_zeroth
        })

        # MfccStaticExtractor is the direct parent, so one super() call runs its init;
        # calling MfccStaticExtractor.__init__ explicitly as well would only repeat the same work.
        super(MfccAccelerationExtractor, self).__init__(**kwargs)

        self.width = width

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MfccAccelerationExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Acceleration', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='width', value=self.width) + '\n'
        return output

    def __getstate__(self):
        # Store the delta window width alongside the parent state so that a restored
        # extractor can still run to_string() and extract().
        d = super(MfccAccelerationExtractor, self).__getstate__()
        d.update({
            'width': self.width
        })

        return d

    def __setstate__(self, d):
        super(MfccAccelerationExtractor, self).__setstate__(d)

        # States saved before 'width' was serialized do not carry the key;
        # fall back to the constructor default in that case.
        self.width = d.get('width', 9)

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray
            MFCC acceleration (second-order delta), same shape as the static MFCC matrix
            produced by the parent class extractor

        """

        mfccs = super(MfccAccelerationExtractor, self).extract(y=y)
        return librosa.feature.delta(mfccs, width=self.width, order=2, axis=-1)


class ZeroCrossingRateExtractor(FeatureExtractor):
    """Feature extractor class to extract zero crossing rate features"""
    label = 'zcr'  #: Extractor label
    description = 'Zero crossing rate (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 center=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        center : bool
            If True, frames are centered by padding the edges of signal.
            Default value True

        """

        # The framing parameters are named explicitly in the signature for convenience;
        # they are handed on to the parent class through kwargs.
        kwargs.update(
            fs=fs,
            win_length_samples=win_length_samples,
            hop_length_samples=hop_length_samples,
            win_length_seconds=win_length_seconds,
            hop_length_seconds=hop_length_seconds
        )

        super(ZeroCrossingRateExtractor, self).__init__(**kwargs)

        self.center = center

    def to_string(self, ui=None, indent=0):
        """Render the extractor parameters as a printable string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        stringifier = FancyStringifier() if ui is None else ui

        # Parent description first, then the ZCR specific section below it.
        base_text = super(ZeroCrossingRateExtractor, self).to_string(ui=stringifier, indent=indent)
        section_title = stringifier.line(field='ZCR', indent=indent) + '\n'
        center_row = stringifier.data(indent=indent + 2, field='center', value=self.center) + '\n'

        return base_text + section_title + center_row

    def __getstate__(self):
        # Parent state extended with the parameter owned by this class.
        state = super(ZeroCrossingRateExtractor, self).__getstate__()
        state['center'] = self.center

        return state

    def __setstate__(self, d):
        super(ZeroCrossingRateExtractor, self).__setstate__(d)

        self.center = d['center']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal, containers are unwrapped to their `data` field

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            zero crossing rate

        """

        from dcase_util.containers import AudioContainer

        signal = y.data if isinstance(y, AudioContainer) else y

        zcr = librosa.feature.zero_crossing_rate(
            y=signal,
            frame_length=self.win_length_samples,
            hop_length=self.hop_length_samples,
            center=self.center
        )

        # Force a single-row matrix.
        return zcr.reshape((1, -1))


class RMSEnergyExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract Root-mean-square energy features"""
    label = 'rmse'  #: Extractor label
    description = 'Root-mean-square energy (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 center=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type. Passed to the parent class, but this extractor always
            overrides it with 'magnitude' afterwards.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        center : bool
            If True, frames are centered by padding the edges of signal.
            Default value True

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # SpectralFeatureExtractor is the direct parent, so one super() call runs its init.
        super(RMSEnergyExtractor, self).__init__(**kwargs)

        # The magnitude spectrogram is used regardless of the value given by the caller.
        self.spectrogram_type = 'magnitude'
        self.center = center

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(RMSEnergyExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='RMSEnergy', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'

        return output

    def __getstate__(self):
        d = super(RMSEnergyExtractor, self).__getstate__()
        d.update({
            'center': self.center
        })

        return d

    def __setstate__(self, d):
        super(RMSEnergyExtractor, self).__setstate__(d)

        self.center = d['center']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            rmse

        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=self.center,
            window=self.window
        )

        # librosa validates the number of spectrogram bins against frame_length and uses
        # it for normalization; its default (2048) is only right when n_fft is 2048,
        # so the FFT length actually used is passed explicitly.
        return librosa.feature.rms(
            S=spectrogram,
            frame_length=self.n_fft
        ).reshape((1, -1))


class SpectralCentroidExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract Centroid features"""
    label = 'centroid'  #: Extractor label
    description = 'Centroid (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 center=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type. Passed to the parent class, but this extractor always
            overrides it with 'magnitude' afterwards.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        center : bool
            If True, the input signal is padded so that frames are centered.
            Default value True

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # SpectralFeatureExtractor is the direct parent, so one super() call runs its init.
        super(SpectralCentroidExtractor, self).__init__(**kwargs)

        # The magnitude spectrogram is used regardless of the value given by the caller.
        self.spectrogram_type = 'magnitude'
        self.center = center

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(SpectralCentroidExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='SpectralCentroid', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'

        return output

    def __getstate__(self):
        d = super(SpectralCentroidExtractor, self).__getstate__()
        d.update({
            'center': self.center
        })

        return d

    def __setstate__(self, d):
        super(SpectralCentroidExtractor, self).__setstate__(d)

        self.center = d['center']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            spectral centroid

        """
        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=self.center,
            window=self.window
        )

        # The sampling rate has to be given explicitly: librosa maps spectrogram bins to
        # frequencies using `sr`, and its default (22050 Hz) would mis-scale the centroid
        # for signals sampled at any other rate.
        return librosa.feature.spectral_centroid(
            S=spectrogram,
            sr=self.fs
        ).reshape((1, -1))


class EmbeddingExtractor(FeatureExtractor):
    """Embedding extractor base class

    Subclasses are expected to override `extract`; the implementation here does nothing.

    """
    label = 'embedding'  #: Extractor label
    description = 'Embedding extractor base class'  #: Extractor description

    def __init__(self, **kwargs):
        """Constructor

        Parameters
        ----------
        **kwargs
            Forwarded unchanged to the parent class constructor.

        """

        # FeatureExtractor is the direct parent, so one super() call runs its init;
        # calling FeatureExtractor.__init__ explicitly as well would only repeat the same work.
        super(EmbeddingExtractor, self).__init__(**kwargs)

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        # No embedding specific fields at this level, the parent description is returned as is.
        return super(EmbeddingExtractor, self).to_string(ui=ui, indent=indent)

    def extract(self, y):
        """Extract features for the audio signal.

        Placeholder in the base class, no extraction is performed.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        None

        """

        pass


class OpenL3Extractor(EmbeddingExtractor):
    """OpenL3 Embedding extractor class

    The `openl3` package is an optional dependency and is imported lazily. The model
    object is not part of the serialized state; it is loaded again in `__setstate__`.

    """
    label = 'openl3'  #: Extractor label
    description = 'OpenL3 (embedding)'  #: Extractor description

    def __init__(self, fs=48000, hop_length_samples=None, hop_length_seconds=0.02,
                 model=None, input_repr='mel256', content_type='music',
                 embedding_size=6144,
                 center=True, batch_size=32, verbose=False,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal. If not 48kHz audio will be resampled.
            Default value 48000

        hop_length_samples : int
            Hop length in samples.
            Default value None

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        model : keras.models.Model or None
            Loaded model object. If a model is provided, then `input_repr`, `content_type`, and `embedding_size` will be ignored. If None is provided, the model will be loaded using the provided values of `input_repr`, `content_type` and `embedding_size`.
            Default value None

        input_repr : "linear", "mel128", or "mel256"
            Spectrogram representation used for model. Ignored if `model` is
            a valid Keras model.
            Default value "mel256"

        content_type : "music" or "env"
            Type of content used to train the embedding model. Ignored if `model` is
            a valid Keras model.
            Default value "music"

        embedding_size : 6144 or 512
            Embedding dimensionality. Ignored if `model` is a valid Keras model.
            Default value 6144

        center : bool
            If True, pads beginning of signal so timestamps correspond to center of window.
            Default value True

        batch_size : int
            Batch size used for input to embedding model
            Default value 32

        verbose : bool
            If True, prints verbose messages.
            Default value False

        Raises
        ------
        ImportError
            The openl3 package cannot be imported.

        """
        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        # The analysis window is fixed to one second (fs samples).
        kwargs.update({
            'fs': fs,
            'win_length_samples': fs,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': 1.0,
            'hop_length_seconds': hop_length_seconds,
        })

        # EmbeddingExtractor is the direct parent, so one super() call runs its init.
        super(OpenL3Extractor, self).__init__(**kwargs)

        self.model = model
        self.input_repr = input_repr
        self.content_type = content_type
        self.embedding_size = embedding_size
        self.center = center
        self.batch_size = batch_size
        self.verbose = verbose

        # Suppress tensorflow warnings before openl3 is imported
        import os
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        logging.getLogger('tensorflow').setLevel(logging.FATAL)

        openl3 = self._import_openl3()

        if self.model is None:
            self.model = openl3.models.load_audio_embedding_model(
                input_repr=self.input_repr,
                content_type=self.content_type,
                embedding_size=self.embedding_size
            )

    def _import_openl3(self):
        """Import the optional openl3 package

        Returns
        -------
        module
            The imported openl3 module

        Raises
        ------
        ImportError
            The package is not available; the failure is logged and re-raised with install instructions.

        """

        try:
            import openl3

        except ImportError:
            message = '{name}: Unable to import OpenL3 module. You can install it with `pip install openl3`.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

        return openl3

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(OpenL3Extractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='OpenL3Extractor', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='input_repr', value=self.input_repr) + '\n'
        output += ui.data(indent=indent + 2, field='content_type', value=self.content_type) + '\n'
        output += ui.data(indent=indent + 2, field='embedding_size', value=self.embedding_size) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'
        output += ui.data(indent=indent + 2, field='batch_size', value=self.batch_size) + '\n'
        output += ui.data(indent=indent + 2, field='verbose', value=self.verbose) + '\n'

        return output

    def __getstate__(self):
        # The model object itself is not stored, only the parameters needed to load it again.
        d = super(OpenL3Extractor, self).__getstate__()
        d.update({
            'input_repr': self.input_repr,
            'content_type': self.content_type,
            'embedding_size': self.embedding_size,
            'center': self.center,
            'batch_size': self.batch_size,
            'verbose': self.verbose
        })

        return d

    def __setstate__(self, d):
        super(OpenL3Extractor, self).__setstate__(d)
        self.input_repr = d['input_repr']
        self.content_type = d['content_type']
        self.embedding_size = d['embedding_size']
        self.center = d['center']
        self.batch_size = d['batch_size']
        self.verbose = d['verbose']

        openl3 = self._import_openl3()

        # The model is always loaded again from the stored parameters.
        self.model = openl3.models.load_audio_embedding_model(
            input_repr=self.input_repr,
            content_type=self.content_type,
            embedding_size=self.embedding_size
        )

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : numpy.ndarray
            Transpose of the embedding array returned by `openl3.get_audio_embedding`.
            The timestamps returned along with it are discarded.

        Raises
        ------
        ImportError
            The openl3 package cannot be imported.

        """
        openl3 = self._import_openl3()

        embedding, _timestamps = openl3.get_audio_embedding(
            audio=y,
            sr=self.fs,
            model=self.model,
            center=self.center,
            hop_size=self.hop_length_seconds,
            batch_size=self.batch_size,
            verbose=self.verbose
        )
        return embedding.T


[docs]class TorchOpenL3Extractor(EmbeddingExtractor):
    """TorchOpenL3 Embedding extractor class"""
    label = 'torchopenl3'  #: Extractor label
    description = 'TorchOpenL3 (embedding)'  #: Extractor description

[docs]    def __init__(self, fs=48000, hop_length_samples=None, hop_length_seconds=0.02,
                 model=None, input_repr='mel256', content_type="music",
                 embedding_size=6144,
                 center=True, batch_size=32, sampler="resampy",
                 verbose=False,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal. If not 48kHz audio will be resampled.
            Default value 48000

        hop_length_samples : int
            Hop length in samples.
            Default value None

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        model : keras.models.Model or None
            Loaded model object. If a model is provided, then `input_repr`, `content_type`, and `embedding_size` will be ignored. If None is provided, the model will be loaded using the provided values of `input_repr`, `content_type` and `embedding_size`.
            Default value None

        input_repr : "linear", "mel128", or "mel256"
            Spectrogram representation used for model. Ignored if `model` is
            a valid Keras model.
            Default value "mel256"

        content_type : "music" or "env"
            Type of content used to train the embedding model. Ignored if `model` is
            a valid Keras model.
            Default value "music"

        embedding_size : 6144 or 512
            Embedding dimensionality. Ignored if `model` is a valid Keras model.
            Default value 6144

        center : bool
            If True, pads beginning of signal so timestamps correspond to center of window.
            Default value True

        batch_size : int
            Batch size used for input to embedding model
            Default value 32

        sampler : str
            Resampling library to be used. Possible values are "resampy" or "julian"
            Default value "resampy"

        verbose : bool
            If True, prints verbose messages.
            Default value False

        """
        # Inject parameters for the parent classes back to kwargs. For the convenience they are expose explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': fs,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': 1.0,
            'hop_length_seconds': hop_length_seconds,
        })

        # Run EmbeddingExtractor init
        EmbeddingExtractor.__init__(self, **kwargs)

        super(TorchOpenL3Extractor, self).__init__(**kwargs)

        self.model = model
        self.input_repr = input_repr
        self.content_type = content_type
        self.embedding_size = embedding_size
        self.center = center
        self.batch_size = batch_size
        self.sampler = sampler
        self.verbose = verbose

        try:
            import torchopenl3

        except ImportError:
            message = '{name}: Unable to import TorchOpenL3 module. You can install it with `pip install torchopenl3`.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

        if self.model is None:
            self.model = torchopenl3.models.load_audio_embedding_model(
                input_repr=self.input_repr ,
                content_type=self.content_type,
                embedding_size=self.embedding_size
            )

    def to_string(self, ui=None, indent=0):
        """Render the extractor parameters as a printable string

        The description produced by the parent class is extended with a
        'TorchOpenL3Extractor' section listing the embedding parameters.

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        stringifier = FancyStringifier() if ui is None else ui

        text = super(TorchOpenL3Extractor, self).to_string(ui=stringifier, indent=indent)
        text += stringifier.line(field='TorchOpenL3Extractor', indent=indent) + '\n'

        # One row per parameter, in display order; field labels equal the attribute names.
        shown_fields = (
            'input_repr',
            'content_type',
            'embedding_size',
            'center',
            'batch_size',
            'sampler',
            'verbose'
        )
        for field_name in shown_fields:
            text += stringifier.data(indent=indent + 2, field=field_name, value=getattr(self, field_name)) + '\n'

        return text

    def __getstate__(self):
        # Parent state extended with the parameters owned by this class.
        # The model object is not stored.
        state = super(TorchOpenL3Extractor, self).__getstate__()

        stored_fields = (
            'input_repr',
            'content_type',
            'embedding_size',
            'center',
            'batch_size',
            'sampler',
            'verbose'
        )
        for field_name in stored_fields:
            state[field_name] = getattr(self, field_name)

        return state

    def __setstate__(self, d):
        # Restore the parent state and the parameters owned by this class, then load
        # the model again from those parameters.
        super(TorchOpenL3Extractor, self).__setstate__(d)
        self.input_repr = d['input_repr']
        self.content_type = d['content_type']
        self.embedding_size = d['embedding_size']
        self.center = d['center']
        self.batch_size = d['batch_size']
        self.sampler = d['sampler']
        self.verbose = d['verbose']

        try:
            import torchopenl3

        except ImportError:
            # Name the package that is actually missing, matching the message used in the constructor.
            message = '{name}: Unable to import TorchOpenL3 module. You can install it with `pip install torchopenl3`.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

        self.model = torchopenl3.models.load_audio_embedding_model(
            input_repr=self.input_repr,
            content_type=self.content_type,
            embedding_size=self.embedding_size
        )

[docs]    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : np.ndarray [shape=(T, D)] or list[np.ndarray]
            Array of embeddings for each window or list of such arrays for multiple audio clips.

        """
        try:
            import torchopenl3

        except ImportError:
            message = '{name}: Unable to import OpenL3 module. You can install it with `pip install torchopenl3`.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

        embedding, timestamps = torchopenl3.get_audio_embedding(
            audio=y,
            sr=self.fs,
            model=self.model,
            center=self.center,
            hop_size=self.hop_length_seconds,
            batch_size=self.batch_size,
            sampler=self.sampler,
            verbose=self.verbose
        )
        return embedding.T.cpu().detach().numpy()

    def forward(self, y):
        """Compute the embedding for the audio signal, keeping the result as torch.Tensor

        Parameters
        ----------
        y : torch.Tensor [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : torch.Tensor
            Transposed embedding tensor given by torchopenl3. Timestamps returned together
            with the embedding are discarded.

        Raises
        ------
        ImportError
            torchopenl3 module cannot be imported

        """

        try:
            import torchopenl3

        except ImportError:
            message = '{name}: Unable to import OpenL3 module. You can install it with `pip install torchopenl3`.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

        # Collect the call parameters first to keep the actual call short
        embedding_parameters = {
            'audio': y,
            'sr': self.fs,
            'model': self.model,
            'center': self.center,
            'hop_size': self.hop_length_seconds,
            'batch_size': self.batch_size,
            'sampler': self.sampler,
            'verbose': self.verbose
        }
        embedding, _ = torchopenl3.get_audio_embedding(**embedding_parameters)

        return embedding.T


class EdgeL3Extractor(EmbeddingExtractor):
    """EdgeL3 Embedding extractor class"""
    label = 'edgel3'  #: Extractor label
    description = 'EdgeL3 (embedding)'  #: Extractor description

    def __init__(self, fs=48000, hop_length_samples=None, hop_length_seconds=0.02,
                 model=None, retrain_type='ft', sparsity=95.45,
                 center=True, verbose=False,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal. If not 48kHz audio will be resampled.
            Default value 48000

        hop_length_samples : int
            Hop length in samples.
            Default value None

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        model : keras.models.Model or None
            Loaded model object. If a model is provided, then `sparsity` will be ignored. If None is provided, the model will be loaded using the provided `sparsity` value.
            Default value None

        retrain_type : {'ft', 'kd'}
            Type of retraining for the sparsified weights of L3 audio model. 'ft' chooses the fine-tuning method
            and 'kd' returns knowledge distilled model.
            Default value "ft"

        sparsity : {95.45, 53.5, 63.5, 72.3, 73.5, 81.0, 87.0, 90.5}
            The desired sparsity of audio model.
            Default value 95.45

        center : bool
            If True, pads beginning of signal so timestamps correspond to center of window.
            Default value True

        verbose : bool
            If True, prints verbose messages.
            Default value False

        Raises
        ------
        ImportError
            edgel3 module cannot be imported

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        # Window length is fixed to one second (fs samples).
        kwargs.update({
            'fs': fs,
            'win_length_samples': fs,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': 1.0,
            'hop_length_seconds': hop_length_seconds,
        })

        # Run EmbeddingExtractor init
        EmbeddingExtractor.__init__(self, **kwargs)

        # NOTE(review): the parent initializer is reached twice (explicit call above and through super()).
        # Presumably the super() call is there for cooperative initialization with mixins, confirm
        # before removing either of the calls.
        super(EdgeL3Extractor, self).__init__(**kwargs)

        self.model = model
        self.retrain_type = retrain_type
        self.sparsity = sparsity
        self.center = center
        self.verbose = verbose

        # Suppress tensorflow warnings, this has to be done before edgel3 is imported
        import os
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        logging.getLogger('tensorflow').setLevel(logging.FATAL)

        edgel3 = self._import_edgel3()

        if self.model is None:
            # No model given, load one based on the retraining type and sparsity
            self.model = edgel3.models.load_embedding_model(
                retrain_type=self.retrain_type,
                sparsity=self.sparsity
            )

    def _import_edgel3(self):
        """Import the optional edgel3 module.

        Returns
        -------
        module
            Imported edgel3 module

        Raises
        ------
        ImportError
            edgel3 module cannot be imported, the failure is logged before raising

        """

        try:
            import edgel3

        except ImportError:
            message = '{name}: Unable to import EdgeL3 module. You can install it with `pip install edgel3`.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

        return edgel3

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        # Parent class information first, followed by the parameters specific to this extractor
        output = super(EdgeL3Extractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='EdgeL3Extractor', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='retrain_type', value=self.retrain_type) + '\n'
        output += ui.data(indent=indent + 2, field='sparsity', value=self.sparsity) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'
        output += ui.data(indent=indent + 2, field='verbose', value=self.verbose) + '\n'

        return output

    def __getstate__(self):
        """Collect the state of the extractor for serialization.

        Returns
        -------
        dict
            State of the parent class updated with 'retrain_type', 'sparsity', 'center' and 'verbose'.
            The model is not added to the state here.

        """

        d = super(EdgeL3Extractor, self).__getstate__()
        d.update({
            'retrain_type': self.retrain_type,
            'sparsity': self.sparsity,
            'center': self.center,
            'verbose': self.verbose
        })

        return d

    def __setstate__(self, d):
        """Restore the extractor from a state dictionary.

        Parameters
        ----------
        d : dict
            State dictionary, keys 'retrain_type', 'sparsity', 'center' and 'verbose' are read
            in addition to the fields consumed by the parent class.

        Raises
        ------
        ImportError
            edgel3 module cannot be imported

        """

        super(EdgeL3Extractor, self).__setstate__(d)
        self.retrain_type = d['retrain_type']
        self.sparsity = d['sparsity']
        self.center = d['center']
        self.verbose = d['verbose']

        edgel3 = self._import_edgel3()

        # The model is always loaded again based on the restored parameters, a model object
        # given to the constructor is not restored here.
        self.model = edgel3.models.load_embedding_model(
            retrain_type=self.retrain_type,
            sparsity=self.sparsity
        )

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : numpy.ndarray
            Transposed embedding given by edgel3, timestamps returned together with the embedding are discarded.
            NOTE(review): earlier documentation stated shape (T, D); the value is the transpose of
            the edgel3 output, confirm the axis order against edgel3.

        Raises
        ------
        ImportError
            edgel3 module cannot be imported

        """

        edgel3 = self._import_edgel3()

        embedding, _ = edgel3.get_embedding(
            audio=y,
            sr=self.fs,
            model=self.model,
            center=self.center,
            hop_size=self.hop_length_seconds,
            verbose=self.verbose
        )

        return embedding.T