# Source code for dcase_util.features.features
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, absolute_import
import numpy
import librosa
import scipy
import logging
import importlib
from dcase_util.containers import ContainerMixin
from dcase_util.ui import FancyStringifier, FancyHTMLStringifier
from dcase_util.utils import setup_logging, get_class_inheritors, is_jupyter
def feature_extractor_list(display=True):
    """List of feature extractors available

    Parameters
    ----------
    display : bool
        Display list immediately, otherwise return string
        Default value True

    Returns
    -------
    str or None
        Multi line string containing extractor table when ``display`` is False.
        When ``display`` is True the table is shown and nothing is returned.

    """

    class_list = get_class_inheritors(FeatureExtractor)
    class_list.sort(key=lambda x: x.__name__, reverse=False)

    class_names = []
    labels = []
    descriptions = []
    for extractor_class in class_list:
        # Classes whose name ends with 'Processor' are left out of the table.
        if not extractor_class.__name__.endswith('Processor'):
            e = extractor_class()
            class_names.append(extractor_class.__name__)
            labels.append(e.label)
            descriptions.append(e.description)

    in_jupyter = is_jupyter()

    if in_jupyter:
        ui = FancyHTMLStringifier()
    else:
        ui = FancyStringifier()

    output = ui.table(
        cell_data=[class_names, labels, descriptions],
        column_headers=['Class name', 'Feature label', 'Description'],
        column_types=['str30', 'str20', 'str50'],
        column_separators=[0, 1]
    )

    if not display:
        return output

    if in_jupyter:
        # Import from the public IPython.display module (importing `display` from
        # IPython.core.display is deprecated) and alias it so that the `display`
        # argument of this function is not rebound.
        from IPython.display import display as ipython_display, HTML
        ipython_display(HTML(output))

    else:
        print(output)
def feature_extractor_factory(feature_extractor_label, **kwargs):
    """Function to get correct feature extractor class instance based on extractor label or class name.

    Parameters
    ----------
    feature_extractor_label : str
        Class name or extractor label

    **kwargs
        Keyword arguments passed to the constructor of the matched class.

    Raises
    ------
    AttributeError
        No class inherited from FeatureExtractor matches the given class name or label.

    Returns
    -------
    Feature extractor class instance

    """

    try:
        feature_extractor_class = None

        # Search all classes inherited from FeatureExtractor for a match
        for item in get_class_inheritors(FeatureExtractor):
            matches_name = str(item.__name__) == feature_extractor_label

            # A label match only counts for classes named *Extractor
            matches_label = (
                hasattr(item, 'label')
                and item.label == feature_extractor_label
                and item.__name__.endswith('Extractor')
            )

            if matches_name or matches_label:
                # Resolve the class through its defining module
                feature_extractor_class = getattr(
                    importlib.import_module(str(item.__module__)),
                    item.__name__
                )
                break

        # Valid feature extractor class not found, raise error
        if not feature_extractor_class:
            raise AttributeError

    except AttributeError:
        message = 'Invalid FeatureExtractor class name or extractor label given [{label}].'.format(
            label=feature_extractor_label
        )

        logger = logging.getLogger(__name__)
        if not logger.handlers:
            setup_logging()

        logger.exception(message)
        raise AttributeError(message)

    return feature_extractor_class(**kwargs)
class FeatureExtractor(ContainerMixin):
    """Feature extractor base class"""
    label = 'extractor_base'  #: Extractor label
    description = 'Feature extractor base class'  #: Extractor description

    def __init__(self, fs=44100,
                 win_length_samples=None, hop_length_samples=None,
                 win_length_seconds=0.04, hop_length_seconds=0.02, **kwargs):
        """Constructor

        Frame lengths given in samples take precedence; when they are not given,
        they are derived from the lengths given in seconds and the sampling rate.

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal
            Default value 44100

        win_length_samples : int
            Window length in samples
            Default value None

        hop_length_samples : int
            Hop length in samples
            Default value None

        win_length_seconds : float
            Window length in seconds
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds
            Default value 0.02

        Raises
        ------
        ValueError
            fs is None, or window / hop length in samples cannot be determined.

        """

        # Run ContainerMixin init
        ContainerMixin.__init__(self, **kwargs)

        self.eps = numpy.spacing(1)

        if fs is not None:
            self.fs = fs

        else:
            message = '{name}: No fs set'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ValueError(message)

        self.win_length_samples = win_length_samples
        self.hop_length_samples = hop_length_samples

        self.win_length_seconds = win_length_seconds
        self.hop_length_seconds = hop_length_seconds

        # Derive sample counts from seconds only when they were not given explicitly
        if not self.win_length_samples and self.win_length_seconds and self.fs:
            self.win_length_samples = int(self.fs * self.win_length_seconds)

        if not self.hop_length_samples and self.hop_length_seconds and self.fs:
            self.hop_length_samples = int(self.fs * self.hop_length_seconds)

        if self.win_length_samples is None:
            message = '{name}: No win_length_samples set'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ValueError(message)

        if self.hop_length_samples is None:
            message = '{name}: No hop_length_samples set'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ValueError(message)

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = ''
        output += ui.class_name(self.__class__.__name__, indent=indent) + '\n'

        if hasattr(self, 'filename') and self.filename:
            # Use the stringifier given by the caller so that the filename row is
            # rendered in the same format (plain text / HTML) as the other rows.
            output += ui.data(field='filename', value=self.filename, indent=indent) + '\n'

        output += ui.data(field='fs', value=self.fs, indent=indent) + '\n'

        output += ui.line(field='Frame blocking', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='hop_length_samples', value=self.hop_length_samples) + '\n'
        output += ui.data(indent=indent + 2, field='hop_length_seconds', value=self.hop_length_seconds, unit='sec') + '\n'
        output += ui.data(indent=indent + 2, field='win_length_samples', value=self.win_length_samples) + '\n'
        output += ui.data(indent=indent + 2, field='win_length_seconds', value=self.win_length_seconds, unit='sec') + '\n'

        return output

    def __getstate__(self):
        # Return only needed data for pickle
        return {
            'eps': self.eps,
            'fs': self.fs,
            'win_length_samples': self.win_length_samples,
            'hop_length_samples': self.hop_length_samples,
            'win_length_seconds': self.win_length_seconds,
            'hop_length_seconds': self.hop_length_seconds
        }

    def __setstate__(self, d):
        # Restore the fields stored by __getstate__
        self.eps = d['eps']
        self.fs = d['fs']
        self.win_length_samples = d['win_length_samples']
        self.hop_length_samples = d['hop_length_samples']
        self.win_length_seconds = d['win_length_seconds']
        self.hop_length_seconds = d['hop_length_seconds']

    def __call__(self, *args, **kwargs):
        # Calling the extractor instance is a shortcut for extract()
        return self.extract(*args, **kwargs)

    def extract(self, y):
        """Extract features for the audio signal (PLACEHOLDER).

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        None

        """

        pass
class SpectralFeatureExtractor(FeatureExtractor):
    """Spectral feature extractor base class"""
    label = 'spectrogram'  #: Extractor label
    description = 'Spectral feature extractor base class (Librosa)'  #: Extractor description

    def __init__(self, spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric', **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.

        win_length_samples : int
            Window length in samples.

        hop_length_samples : int
            Hop length in samples.

        win_length_seconds : float
            Window length in seconds.

        hop_length_seconds : float
            Hop length in seconds.

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        """

        # Run FeatureExtractor init (once; frame parameters travel in kwargs)
        super(SpectralFeatureExtractor, self).__init__(**kwargs)

        self.spectrogram_type = spectrogram_type
        self.n_fft = n_fft

        self.window_type = window_type
        self.window = self.get_window_function(
            n=self.win_length_samples,
            window_type=self.window_type
        )

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(SpectralFeatureExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Spectrogram', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='spectrogram_type', value=self.spectrogram_type) + '\n'
        output += ui.data(indent=indent + 2, field='n_fft', value=self.n_fft) + '\n'
        output += ui.data(indent=indent + 2, field='window_type', value=self.window_type) + '\n'

        return output

    def __getstate__(self):
        # Extend the base state with the spectral parameters. Without these the
        # unpickled object has no n_fft / window and cannot extract features.
        d = super(SpectralFeatureExtractor, self).__getstate__()
        d.update({
            'spectrogram_type': self.spectrogram_type,
            'n_fft': self.n_fft,
            'window_type': self.window_type,
        })

        return d

    def __setstate__(self, d):
        super(SpectralFeatureExtractor, self).__setstate__(d)

        self.spectrogram_type = d['spectrogram_type']
        self.n_fft = d['n_fft']
        self.window_type = d['window_type']

        # The window array is not stored, it is rebuilt from its parameters
        self.window = self.get_window_function(
            n=self.win_length_samples,
            window_type=self.window_type
        )

    def get_window_function(self, n, window_type='hamming_asymmetric'):
        """Window function

        Parameters
        ----------
        n : int
            window length

        window_type : str
            window type, one of 'hamming_asymmetric', 'hamming_symmetric', 'hamming',
            'hann_asymmetric', 'hann_symmetric' or 'hann'
            Default value 'hamming_asymmetric'

        Raises
        ------
        ValueError:
            Unknown window type

        Returns
        -------
        numpy.array
            window function

        """

        # Window functions live in scipy.signal.windows; the scipy.signal.hamming /
        # scipy.signal.hann aliases are not available in current SciPy releases.
        from scipy.signal import windows as signal_windows

        # Windowing function
        if window_type == 'hamming_asymmetric':
            return signal_windows.hamming(n, sym=False)

        elif window_type == 'hamming_symmetric' or window_type == 'hamming':
            return signal_windows.hamming(n, sym=True)

        elif window_type == 'hann_asymmetric':
            return signal_windows.hann(n, sym=False)

        elif window_type == 'hann_symmetric' or window_type == 'hann':
            return signal_windows.hann(n, sym=True)

        else:
            message = '{name}: Unknown window type [{window_type}]'.format(
                name=self.__class__.__name__,
                window_type=window_type
            )

            self.logger.exception(message)
            raise ValueError(message)

    def get_spectrogram(self, y, n_fft=None, win_length_samples=None, hop_length_samples=None,
                        window=None, center=True, spectrogram_type=None):
        """Spectrogram

        Arguments left as None fall back to the values stored in the extractor.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray
            Audio data

        n_fft : int
            FFT size
            Default value None

        win_length_samples : int
            Window length in samples
            Default value None

        hop_length_samples : int
            Hop length in samples
            Default value None

        window : numpy.array
            Window function
            Default value None

        center : bool
            If true, input signal is padded so to the frame is centered at hop length
            Default value True

        spectrogram_type : str
            Type of spectrogram "magnitude" or "power"
            Default value None

        Raises
        ------
        ValueError:
            AudioContainer input has more than one channel, or unknown spectrogram type

        Returns
        -------
        numpy.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
            STFT matrix

        """

        if n_fft is None:
            n_fft = self.n_fft

        if win_length_samples is None:
            win_length_samples = self.win_length_samples

        if hop_length_samples is None:
            hop_length_samples = self.hop_length_samples

        if window is None and self.window is not None:
            window = self.window

        if spectrogram_type is None:
            spectrogram_type = self.spectrogram_type

        from dcase_util.containers import AudioContainer

        if isinstance(y, AudioContainer):
            if y.channels == 1:
                y = y.data

            else:
                message = '{name}: Input has more than one audio channel.'.format(
                    name=self.__class__.__name__
                )

                self.logger.exception(message)
                raise ValueError(message)

        # Validate the type before doing any computation
        if spectrogram_type not in ('magnitude', 'power'):
            message = '{name}: Unknown spectrum type [{spectrogram_type}]'.format(
                name=self.__class__.__name__,
                spectrogram_type=spectrogram_type
            )

            self.logger.exception(message)
            raise ValueError(message)

        # eps is added to the signal before the transform
        magnitude = numpy.abs(
            librosa.stft(
                y + self.eps,
                n_fft=n_fft,
                win_length=win_length_samples,
                hop_length=hop_length_samples,
                center=center,
                window=window
            )
        )

        if spectrogram_type == 'power':
            return magnitude ** 2

        return magnitude

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1 + n_fft/2, t)]
            spectrum

        """

        return self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=True,
            window=self.window
        )
class MelExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract mel band energy features"""
    label = 'mel'  #: Extractor label
    description = 'Mel band energy (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False, logarithmic=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz)
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney
            Default value False

        logarithmic : bool
            Switch for log mel-band energies
            Default value True

        """

        # Inject parameters for the parent classes back to kwargs.
        # For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # Run SpectralFeatureExtractor init (once)
        super(MelExtractor, self).__init__(**kwargs)

        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.normalize_mel_bands = normalize_mel_bands
        self.htk = htk
        self.logarithmic = logarithmic

        self.mel_basis = self._create_mel_basis()

    def _create_mel_basis(self):
        # Build the mel filterbank from the current parameters, optionally
        # scaling each band so that its peak is 1.0.
        mel_basis = librosa.filters.mel(
            sr=self.fs,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            htk=self.htk
        )

        if self.normalize_mel_bands:
            mel_basis /= numpy.max(mel_basis, axis=-1)[:, None]

        return mel_basis

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MelExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Mel', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='n_mels', value=self.n_mels) + '\n'
        output += ui.data(indent=indent + 2, field='fmin', value=self.fmin) + '\n'
        output += ui.data(indent=indent + 2, field='fmax', value=self.fmax if self.fmax is not None else 'None') + '\n'
        output += ui.data(indent=indent + 2, field='normalize_mel_bands', value=self.normalize_mel_bands) + '\n'
        output += ui.data(indent=indent + 2, field='htk', value=self.htk) + '\n'
        output += ui.data(indent=indent + 2, field='logarithmic', value=self.logarithmic) + '\n'

        return output

    def __getstate__(self):
        # Extend the parent state with the mel parameters; the filterbank itself
        # is not stored, it is rebuilt in __setstate__.
        d = super(MelExtractor, self).__getstate__()
        d.update({
            'n_mels': self.n_mels,
            'fmin': self.fmin,
            'fmax': self.fmax,
            'normalize_mel_bands': self.normalize_mel_bands,
            'htk': self.htk,
            'logarithmic': self.logarithmic,
        })

        return d

    def __setstate__(self, d):
        super(MelExtractor, self).__setstate__(d)

        self.n_mels = d['n_mels']
        self.fmin = d['fmin']
        self.fmax = d['fmax']
        self.normalize_mel_bands = d['normalize_mel_bands']
        self.htk = d['htk']
        self.logarithmic = d['logarithmic']

        self.mel_basis = self._create_mel_basis()

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(n_mels, t)]
            mel band energies

        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=True,
            window=self.window
        )

        mel_spectrum = numpy.dot(self.mel_basis, spectrogram)

        if self.logarithmic:
            # eps is added before taking the logarithm
            mel_spectrum = numpy.log(mel_spectrum + self.eps)

        return mel_spectrum
class MfccStaticExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract static MFCC features"""
    label = 'mfcc'  #: Extractor label
    description = 'MFCC (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False,
                 n_mfcc=20, omit_zeroth=False,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate.
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz).
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney
            Default value False

        n_mfcc : int
            Number of MFCC coefficients
            Default value 20

        omit_zeroth : bool
            Omit 0th coefficient
            Default value False

        """

        # Inject parameters for the parent classes back to kwargs.
        # For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # Run SpectralFeatureExtractor init (once)
        super(MfccStaticExtractor, self).__init__(**kwargs)

        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.normalize_mel_bands = normalize_mel_bands
        self.htk = htk
        self.n_mfcc = n_mfcc
        self.omit_zeroth = omit_zeroth

        self.mel_basis = self._create_mel_basis()

    def _create_mel_basis(self):
        # Build the mel filterbank from the current parameters, optionally
        # scaling each band so that its peak is 1.0.
        mel_basis = librosa.filters.mel(
            sr=self.fs,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            htk=self.htk
        )

        if self.normalize_mel_bands:
            mel_basis /= numpy.max(mel_basis, axis=-1)[:, None]

        return mel_basis

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MfccStaticExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='MFCC', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='n_mels', value=self.n_mels) + '\n'
        output += ui.data(indent=indent + 2, field='fmin', value=self.fmin) + '\n'
        output += ui.data(indent=indent + 2, field='fmax', value=self.fmax) + '\n'
        output += ui.data(indent=indent + 2, field='normalize_mel_bands', value=self.normalize_mel_bands) + '\n'
        output += ui.data(indent=indent + 2, field='htk', value=self.htk) + '\n'
        output += ui.data(indent=indent + 2, field='n_mfcc', value=self.n_mfcc) + '\n'
        # omit_zeroth changes the number of output coefficients, so it is reported too
        output += ui.data(indent=indent + 2, field='omit_zeroth', value=self.omit_zeroth) + '\n'

        return output

    def __getstate__(self):
        # Extend the parent state with the MFCC parameters; the filterbank itself
        # is not stored, it is rebuilt in __setstate__.
        d = super(MfccStaticExtractor, self).__getstate__()
        d.update({
            'n_mels': self.n_mels,
            'fmin': self.fmin,
            'fmax': self.fmax,
            'normalize_mel_bands': self.normalize_mel_bands,
            'htk': self.htk,
            'n_mfcc': self.n_mfcc,
            'omit_zeroth': self.omit_zeroth,
        })

        return d

    def __setstate__(self, d):
        super(MfccStaticExtractor, self).__setstate__(d)

        self.n_mels = d['n_mels']
        self.fmin = d['fmin']
        self.fmax = d['fmax']
        self.normalize_mel_bands = d['normalize_mel_bands']
        self.htk = d['htk']
        self.n_mfcc = d['n_mfcc']
        self.omit_zeroth = d['omit_zeroth']

        self.mel_basis = self._create_mel_basis()

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(n_mfcc, t), or (n_mfcc - 1, t) when omit_zeroth is set]
            mfccs

        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=True,
            window=self.window
        )

        mel_spectrum = numpy.dot(self.mel_basis, spectrogram)

        mfccs = librosa.feature.mfcc(
            S=librosa.power_to_db(mel_spectrum),
            n_mfcc=self.n_mfcc
        )

        if self.omit_zeroth:
            # Remove first coefficient
            mfccs = mfccs[1:, :]

        return mfccs
class MfccDeltaExtractor(MfccStaticExtractor):
    """Feature extractor class to extract MFCC delta features"""
    label = 'mfcc_delta'  #: Extractor label
    description = 'MFCC delta (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False,
                 n_mfcc=20, omit_zeroth=False,
                 width=9,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate.
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz).
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0.
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0.
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney.
            Default value False

        n_mfcc : int
            Number of MFCC coefficients.
            Default value 20

        omit_zeroth : bool
            Omit 0th coefficient.
            Default value False

        width : int
            Width of the delta window.
            Default value 9

        """

        # Inject parameters for the parent classes back to kwargs.
        # For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type,
            'n_mels': n_mels,
            'fmin': fmin,
            'fmax': fmax,
            'normalize_mel_bands': normalize_mel_bands,
            'htk': htk,
            'n_mfcc': n_mfcc,
            'omit_zeroth': omit_zeroth
        })

        # Run MfccStaticExtractor init (once). super() is given this class, so the
        # lookup starts from the direct parent and does not skip it.
        super(MfccDeltaExtractor, self).__init__(**kwargs)

        self.width = width

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MfccDeltaExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Delta', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='width', value=self.width) + '\n'

        return output

    def __getstate__(self):
        # width is read by extract(), so it has to be part of the pickled state
        d = super(MfccDeltaExtractor, self).__getstate__()
        d.update({
            'width': self.width
        })

        return d

    def __setstate__(self, d):
        super(MfccDeltaExtractor, self).__setstate__(d)

        self.width = d['width']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [same shape as the static MFCC matrix]
            MFCC delta

        """

        # First-order difference of the static MFCCs along the time axis
        mfccs = super(MfccDeltaExtractor, self).extract(y=y)

        return librosa.feature.delta(mfccs, width=self.width, order=1, axis=-1)
class MfccAccelerationExtractor(MfccStaticExtractor):
    """Feature extractor class to extract MFCC acceleration features"""
    label = 'mfcc_acceleration'  #: Extractor label
    description = 'MFCC acceleration (Librosa)'  #: Extractor description

    def __init__(self, fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 n_mels=40, fmin=0, fmax=None, normalize_mel_bands=False, htk=False,
                 n_mfcc=20, omit_zeroth=False,
                 width=9,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type, magnitude or power spectrogram.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        n_mels : int
            Number of mel bands to generate.
            Default value 40

        fmin : int
            Lowest frequency in mel bands (in Hz).
            Default value 0

        fmax : int
            Highest frequency in mel bands (in Hz), if None, fmax = fs/2.0.
            Default value None

        normalize_mel_bands : bool
            Normalize mel band to have peak at 1.0.
            Default value False

        htk : bool
            Use HTK formula for mel band creation instead of Slaney.
            Default value False

        n_mfcc : int
            Number of MFCC coefficients.
            Default value 20

        omit_zeroth : bool
            Omit 0th coefficient.
            Default value False

        width : int
            Width of the delta window.
            Default value 9

        """

        # Inject parameters for the parent classes back to kwargs.
        # For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type,
            'n_mels': n_mels,
            'fmin': fmin,
            'fmax': fmax,
            'normalize_mel_bands': normalize_mel_bands,
            'htk': htk,
            'n_mfcc': n_mfcc,
            'omit_zeroth': omit_zeroth
        })

        # Run MfccStaticExtractor init (once)
        super(MfccAccelerationExtractor, self).__init__(**kwargs)

        self.width = width

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(MfccAccelerationExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='Acceleration', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='width', value=self.width) + '\n'

        return output

    def __getstate__(self):
        # width is read by extract(), so it has to be part of the pickled state
        d = super(MfccAccelerationExtractor, self).__getstate__()
        d.update({
            'width': self.width
        })

        return d

    def __setstate__(self, d):
        super(MfccAccelerationExtractor, self).__setstate__(d)

        self.width = d['width']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [same shape as the static MFCC matrix]
            MFCC acceleration

        """

        # Second-order difference of the static MFCCs along the time axis
        mfccs = super(MfccAccelerationExtractor, self).extract(y=y)

        return librosa.feature.delta(mfccs, width=self.width, order=2, axis=-1)
class ZeroCrossingRateExtractor(FeatureExtractor):
    """Feature extractor class to extract zero crossing rate features"""
    label = 'zcr'  #: Extractor label
    description = 'Zero crossing rate (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 center=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        center : bool
            If True, frames are centered by padding the edges of signal.
            Default value True

        """

        # Hand the explicitly named frame parameters over to the parent class
        # through kwargs.
        kwargs['fs'] = fs
        kwargs['win_length_samples'] = win_length_samples
        kwargs['hop_length_samples'] = hop_length_samples
        kwargs['win_length_seconds'] = win_length_seconds
        kwargs['hop_length_seconds'] = hop_length_seconds

        super(ZeroCrossingRateExtractor, self).__init__(**kwargs)

        self.center = center

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        # Parent rows first, then the ZCR specific section
        lines = [
            super(ZeroCrossingRateExtractor, self).to_string(ui=ui, indent=indent),
            ui.line(field='ZCR', indent=indent) + '\n',
            ui.data(indent=indent + 2, field='center', value=self.center) + '\n',
        ]

        return ''.join(lines)

    def __getstate__(self):
        # Parent state plus the centering switch
        state = super(ZeroCrossingRateExtractor, self).__getstate__()
        state['center'] = self.center

        return state

    def __setstate__(self, d):
        super(ZeroCrossingRateExtractor, self).__setstate__(d)

        self.center = d['center']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            zero crossing rate

        """

        from dcase_util.containers import AudioContainer

        # Containers are unwrapped to their raw data, arrays are used as they are
        signal = y.data if isinstance(y, AudioContainer) else y

        zcr = librosa.feature.zero_crossing_rate(
            y=signal,
            frame_length=self.win_length_samples,
            hop_length=self.hop_length_samples,
            center=self.center
        )

        return zcr.reshape((1, -1))
class RMSEnergyExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract Root-mean-square energy features"""
    label = 'rmse'  #: Extractor label
    description = 'Root-mean-square energy (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 center=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type. The extractor always works on the magnitude
            spectrogram; the stored type is set to 'magnitude' regardless of
            the value given here.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        center : bool
            If True, frames are centered by padding the edges of signal.
            Default value True

        """

        # Inject parameters for the parent classes back to kwargs.
        # For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # Run SpectralFeatureExtractor init (once)
        super(RMSEnergyExtractor, self).__init__(**kwargs)

        # Force magnitude spectrogram after the parent init
        self.spectrogram_type = 'magnitude'
        self.center = center

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(RMSEnergyExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='RMSEnergy', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'

        return output

    def __getstate__(self):
        # Parent state plus the centering switch
        d = super(RMSEnergyExtractor, self).__getstate__()
        d.update({
            'center': self.center
        })

        return d

    def __setstate__(self, d):
        super(RMSEnergyExtractor, self).__setstate__(d)

        self.center = d['center']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            rmse

        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=self.center,
            window=self.window
        )

        # frame_length has to agree with the FFT size the spectrogram was computed
        # with; the librosa default (2048) only fits when n_fft is 2048.
        return librosa.feature.rms(
            S=spectrogram,
            frame_length=self.n_fft
        ).reshape((1, -1))
class SpectralCentroidExtractor(SpectralFeatureExtractor):
    """Feature extractor class to extract Centroid features"""
    label = 'centroid'  #: Extractor label
    description = 'Centroid (Librosa)'  #: Extractor description

    def __init__(self,
                 fs=44100,
                 win_length_samples=None, hop_length_samples=None, win_length_seconds=0.04, hop_length_seconds=0.02,
                 spectrogram_type='magnitude', n_fft=2048, window_type='hamming_asymmetric',
                 center=True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal.
            Default value 44100

        win_length_samples : int
            Window length in samples.
            Default value None

        hop_length_samples : int
            Hop length in samples.
            Default value None

        win_length_seconds : float
            Window length in seconds.
            Default value 0.04

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        spectrogram_type : str
            Spectrogram type. The value is forwarded to the parent class, but this extractor
            always resets ``self.spectrogram_type`` to 'magnitude' afterwards.
            Default value 'magnitude'

        n_fft : int
            Length of the FFT window.
            Default value 2048

        window_type : str
            Window function type.
            Default value 'hamming_asymmetric'

        center : bool
            If True, the input signal is padded so that each frame is centered at its hop position.
            Default value True

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': win_length_samples,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': win_length_seconds,
            'hop_length_seconds': hop_length_seconds,
            'spectrogram_type': spectrogram_type,
            'n_fft': n_fft,
            'window_type': window_type
        })

        # Run SpectralFeatureExtractor init
        SpectralFeatureExtractor.__init__(self, **kwargs)

        # Run the cooperative init as well (kept from the original implementation)
        super(SpectralCentroidExtractor, self).__init__(**kwargs)

        # The centroid is always computed from a magnitude spectrogram, whatever the caller requested.
        self.spectrogram_type = 'magnitude'
        self.center = center

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(SpectralCentroidExtractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='SpectralCentroid', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'

        return output

    def __getstate__(self):
        # Extend the parent state with the only attribute this class adds.
        d = super(SpectralCentroidExtractor, self).__getstate__()
        d.update({
            'center': self.center
        })

        return d

    def __setstate__(self, d):
        super(SpectralCentroidExtractor, self).__setstate__(d)
        self.center = d['center']

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        numpy.ndarray [shape=(1, t)]
            spectral centroid, expressed in Hz relative to the extractor sampling rate ``fs``

        """

        spectrogram = self.get_spectrogram(
            y=y,
            n_fft=self.n_fft,
            win_length_samples=self.win_length_samples,
            hop_length_samples=self.hop_length_samples,
            spectrogram_type=self.spectrogram_type,
            center=self.center,
            window=self.window
        )

        # librosa derives the bin frequencies from `sr` (default 22050). Without passing the
        # real sampling rate the centroid would be scaled by 22050 / fs.
        return librosa.feature.spectral_centroid(
            S=spectrogram,
            sr=self.fs
        ).reshape((1, -1))
class EmbeddingExtractor(FeatureExtractor):
    """Common base class for extractors that produce learned embeddings"""
    label = 'embedding'  #: Extractor label
    description = 'Embedding extractor base class'  #: Extractor description

    def __init__(self, **kwargs):
        """Constructor

        Parameters
        ----------
        **kwargs
            Keyword arguments handed over unchanged to the parent initialisers.

        """

        # Cooperative init first, then the explicit FeatureExtractor init (original call order).
        super(EmbeddingExtractor, self).__init__(**kwargs)
        FeatureExtractor.__init__(self, **kwargs)

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        stringifier = FancyStringifier() if ui is None else ui

        # Nothing class specific to add; the parent description is returned as is.
        return super(EmbeddingExtractor, self).to_string(ui=stringifier, indent=indent)

    def extract(self, y):
        """Extract features for the audio signal.

        Placeholder without any computation; concrete embedding extractors override it.

        Parameters
        ----------
        y : AudioContainer or numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        None

        """

        return None
class OpenL3Extractor(EmbeddingExtractor):
    """OpenL3 Embedding extractor class"""
    label = 'openl3'  #: Extractor label
    description = 'OpenL3 (embedding)'  #: Extractor description

    def __init__(self, fs=48000, hop_length_samples=None, hop_length_seconds=0.02,
                 model=None, input_repr='mel256', content_type='music',
                 embedding_size=6144,
                 center=True, batch_size=32, verbose=False,
                 **kwargs):
        """Constructor

        The analysis window handed to the parent classes is fixed to one second
        (``win_length_samples=fs``, ``win_length_seconds=1.0``).

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal. If not 48kHz audio will be resampled.
            Default value 48000

        hop_length_samples : int
            Hop length in samples.
            Default value None

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        model : keras.models.Model or None
            Loaded model object. If a model is provided, then `input_repr`, `content_type`, and
            `embedding_size` will be ignored. If None is provided, the model will be loaded using
            the provided values of `input_repr`, `content_type` and `embedding_size`.
            Default value None

        input_repr : "linear", "mel128", or "mel256"
            Spectrogram representation used for model. Ignored if `model` is
            a valid Keras model.
            Default value "mel256"

        content_type : "music" or "env"
            Type of content used to train the embedding model. Ignored if `model` is
            a valid Keras model.
            Default value "music"

        embedding_size : 6144 or 512
            Embedding dimensionality. Ignored if `model` is a valid Keras model.
            Default value 6144

        center : bool
            If True, pads beginning of signal so timestamps correspond to center of window.
            Default value True

        batch_size : int
            Batch size used for input to embedding model
            Default value 32

        verbose : bool
            If True, prints verbose messages.
            Default value False

        Raises
        ------
        ImportError
            The `openl3` module cannot be imported.

        """

        # Hand the framing parameters over to the parent classes through kwargs.
        kwargs['fs'] = fs
        kwargs['win_length_samples'] = fs
        kwargs['hop_length_samples'] = hop_length_samples
        kwargs['win_length_seconds'] = 1.0
        kwargs['hop_length_seconds'] = hop_length_seconds

        # Run EmbeddingExtractor init, then the cooperative init (original call order)
        EmbeddingExtractor.__init__(self, **kwargs)
        super(OpenL3Extractor, self).__init__(**kwargs)

        self.model = model
        self.input_repr = input_repr
        self.content_type = content_type
        self.embedding_size = embedding_size
        self.center = center
        self.batch_size = batch_size
        self.verbose = verbose

        backend = self._import_openl3(silence_tensorflow=True)
        if self.model is None:
            self.model = self._load_model(backend)

    def _import_openl3(self, silence_tensorflow=False):
        """Import the `openl3` module and return it, optionally muting tensorflow logging first."""

        try:
            if silence_tensorflow:
                # Suppress tensorflow warnings
                import os
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

                import logging
                logging.getLogger('tensorflow').setLevel(logging.FATAL)

            import openl3

        except ImportError:
            message = '{name}: Unable to import OpenL3 module. You can install it with `pip install openl3`.'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ImportError(message)

        return openl3

    def _load_model(self, backend):
        """Load the audio embedding model selected by the current model parameters."""

        return backend.models.load_audio_embedding_model(
            input_repr=self.input_repr,
            content_type=self.content_type,
            embedding_size=self.embedding_size
        )

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        stringifier = FancyStringifier() if ui is None else ui

        output = super(OpenL3Extractor, self).to_string(ui=stringifier, indent=indent)
        output += stringifier.line(field='OpenL3Extractor', indent=indent) + '\n'

        # One data row per extractor specific parameter, in a fixed order.
        for name in ('input_repr', 'content_type', 'embedding_size', 'center', 'batch_size', 'verbose'):
            output += stringifier.data(indent=indent + 2, field=name, value=getattr(self, name)) + '\n'

        return output

    def __getstate__(self):
        # The model object is deliberately left out of the state; it is reloaded in __setstate__.
        state = super(OpenL3Extractor, self).__getstate__()
        own_parameters = {
            'input_repr': self.input_repr,
            'content_type': self.content_type,
            'embedding_size': self.embedding_size,
            'center': self.center,
            'batch_size': self.batch_size,
            'verbose': self.verbose
        }
        state.update(own_parameters)

        return state

    def __setstate__(self, d):
        super(OpenL3Extractor, self).__setstate__(d)

        for name in ('input_repr', 'content_type', 'embedding_size', 'center', 'batch_size', 'verbose'):
            setattr(self, name, d[name])

        # Rebuild the model from the restored parameters.
        self.model = self._load_model(self._import_openl3())

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : np.ndarray
            Transpose of the embedding array returned by `openl3.get_audio_embedding`
            (presumably shape=(D, T), as OpenL3 documents a (T, D) result - confirm against caller).

        Raises
        ------
        ImportError
            The `openl3` module cannot be imported.

        """

        backend = self._import_openl3()

        # Timestamps are returned by OpenL3 as well but are not used here.
        result = backend.get_audio_embedding(
            audio=y,
            sr=self.fs,
            model=self.model,
            center=self.center,
            hop_size=self.hop_length_seconds,
            batch_size=self.batch_size,
            verbose=self.verbose
        )
        embedding = result[0]

        return embedding.T
class TorchOpenL3Extractor(EmbeddingExtractor):
    """TorchOpenL3 Embedding extractor class"""
    label = 'torchopenl3'  #: Extractor label
    description = 'TorchOpenL3 (embedding)'  #: Extractor description

    def __init__(self, fs=48000, hop_length_samples=None, hop_length_seconds=0.02,
                 model=None, input_repr='mel256', content_type="music",
                 embedding_size=6144,
                 center=True, batch_size=32, sampler="resampy",
                 verbose=False,
                 **kwargs):
        """Constructor

        The analysis window handed to the parent classes is fixed to one second
        (``win_length_samples=fs``, ``win_length_seconds=1.0``).

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal. If not 48kHz audio will be resampled.
            Default value 48000

        hop_length_samples : int
            Hop length in samples.
            Default value None

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        model : object or None
            Loaded TorchOpenL3 model object. If a model is provided, then `input_repr`, `content_type`,
            and `embedding_size` will be ignored. If None is provided, the model will be loaded using
            the provided values of `input_repr`, `content_type` and `embedding_size`.
            Default value None

        input_repr : "linear", "mel128", or "mel256"
            Spectrogram representation used for model. Ignored if `model` is given.
            Default value "mel256"

        content_type : "music" or "env"
            Type of content used to train the embedding model. Ignored if `model` is given.
            Default value "music"

        embedding_size : 6144 or 512
            Embedding dimensionality. Ignored if `model` is given.
            Default value 6144

        center : bool
            If True, pads beginning of signal so timestamps correspond to center of window.
            Default value True

        batch_size : int
            Batch size used for input to embedding model
            Default value 32

        sampler : str
            Resampling library to be used. Possible values are "resampy" or "julian"
            Default value "resampy"

        verbose : bool
            If True, prints verbose messages.
            Default value False

        Raises
        ------
        ImportError
            The `torchopenl3` module cannot be imported.

        """

        # Inject parameters for the parent classes back to kwargs. For convenience they are exposed explicitly here.
        kwargs.update({
            'fs': fs,
            'win_length_samples': fs,
            'hop_length_samples': hop_length_samples,
            'win_length_seconds': 1.0,
            'hop_length_seconds': hop_length_seconds,
        })

        # Run EmbeddingExtractor init
        EmbeddingExtractor.__init__(self, **kwargs)

        # Run the cooperative init as well (kept from the original implementation)
        super(TorchOpenL3Extractor, self).__init__(**kwargs)

        self.model = model
        self.input_repr = input_repr
        self.content_type = content_type
        self.embedding_size = embedding_size
        self.center = center
        self.batch_size = batch_size
        self.sampler = sampler
        self.verbose = verbose

        torchopenl3 = self._import_torchopenl3()

        if self.model is None:
            self.model = self._load_model(torchopenl3)

    def _import_torchopenl3(self):
        """Import the `torchopenl3` module and return it; log and raise ImportError when it is missing."""

        try:
            import torchopenl3

        except ImportError:
            message = '{name}: Unable to import TorchOpenL3 module. You can install it with `pip install torchopenl3`.'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ImportError(message)

        return torchopenl3

    def _load_model(self, torchopenl3):
        """Load the audio embedding model selected by the current model parameters."""

        return torchopenl3.models.load_audio_embedding_model(
            input_repr=self.input_repr,
            content_type=self.content_type,
            embedding_size=self.embedding_size
        )

    def _embed(self, y):
        """Run the TorchOpenL3 model on `y` and return the transposed embedding it produces."""

        torchopenl3 = self._import_torchopenl3()

        # Timestamps are returned as well but are not used by this extractor.
        embedding, timestamps = torchopenl3.get_audio_embedding(
            audio=y,
            sr=self.fs,
            model=self.model,
            center=self.center,
            hop_size=self.hop_length_seconds,
            batch_size=self.batch_size,
            sampler=self.sampler,
            verbose=self.verbose
        )

        return embedding.T

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        if ui is None:
            ui = FancyStringifier()

        output = super(TorchOpenL3Extractor, self).to_string(ui=ui, indent=indent)

        output += ui.line(field='TorchOpenL3Extractor', indent=indent) + '\n'
        output += ui.data(indent=indent + 2, field='input_repr', value=self.input_repr) + '\n'
        output += ui.data(indent=indent + 2, field='content_type', value=self.content_type) + '\n'
        output += ui.data(indent=indent + 2, field='embedding_size', value=self.embedding_size) + '\n'
        output += ui.data(indent=indent + 2, field='center', value=self.center) + '\n'
        output += ui.data(indent=indent + 2, field='batch_size', value=self.batch_size) + '\n'
        output += ui.data(indent=indent + 2, field='sampler', value=self.sampler) + '\n'
        output += ui.data(indent=indent + 2, field='verbose', value=self.verbose) + '\n'

        return output

    def __getstate__(self):
        # The model object is deliberately left out of the state; it is reloaded in __setstate__.
        d = super(TorchOpenL3Extractor, self).__getstate__()
        d.update({
            'input_repr': self.input_repr,
            'content_type': self.content_type,
            'embedding_size': self.embedding_size,
            'center': self.center,
            'batch_size': self.batch_size,
            'sampler': self.sampler,
            'verbose': self.verbose
        })

        return d

    def __setstate__(self, d):
        super(TorchOpenL3Extractor, self).__setstate__(d)

        self.input_repr = d['input_repr']
        self.content_type = d['content_type']
        self.embedding_size = d['embedding_size']
        self.center = d['center']
        self.batch_size = d['batch_size']
        self.sampler = d['sampler']
        self.verbose = d['verbose']

        # Rebuild the model from the restored parameters.
        self.model = self._load_model(self._import_torchopenl3())

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : np.ndarray
            Transposed embedding produced by TorchOpenL3, moved to the CPU, detached and converted
            to a numpy array (presumably shape=(D, T) for a single clip - confirm against caller).

        Raises
        ------
        ImportError
            The `torchopenl3` module cannot be imported.

        """

        return self._embed(y).cpu().detach().numpy()

    def forward(self, y):
        """Extract features for the audio signal, using torch.Tensors

        Parameters
        ----------
        y : torch.Tensor [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : torch.Tensor
            Transposed embedding produced by TorchOpenL3, returned without conversion to numpy.

        Raises
        ------
        ImportError
            The `torchopenl3` module cannot be imported.

        """

        return self._embed(y)
class EdgeL3Extractor(EmbeddingExtractor):
    """EdgeL3 Embedding extractor class"""
    label = 'edgel3'  #: Extractor label
    description = 'EdgeL3 (embedding)'  #: Extractor description

    def __init__(self, fs=48000, hop_length_samples=None, hop_length_seconds=0.02,
                 model=None, retrain_type='ft', sparsity=95.45,
                 center=True, verbose=False,
                 **kwargs):
        """Constructor

        The analysis window handed to the parent classes is fixed to one second
        (``win_length_samples=fs``, ``win_length_seconds=1.0``).

        Parameters
        ----------
        fs : int
            Sampling rate of the incoming signal. If not 48kHz audio will be resampled.
            Default value 48000

        hop_length_samples : int
            Hop length in samples.
            Default value None

        hop_length_seconds : float
            Hop length in seconds.
            Default value 0.02

        model : keras.models.Model or None
            Loaded model object. If a model is provided, then `sparsity` will be ignored. If None is
            provided, the model will be loaded using the provided `sparsity` value.
            Default value None

        retrain_type : {'ft', 'kd'}
            Type of retraining for the sparsified weights of L3 audio model. 'ft' chooses the
            fine-tuning method and 'kd' returns knowledge distilled model.
            Default value "ft"

        sparsity : {95.45, 53.5, 63.5, 72.3, 73.5, 81.0, 87.0, 90.5}
            The desired sparsity of audio model.
            Default value 95.45

        center : bool
            If True, pads beginning of signal so timestamps correspond to center of window.
            Default value True

        verbose : bool
            If True, prints verbose messages.
            Default value False

        Raises
        ------
        ImportError
            The `edgel3` module cannot be imported.

        """

        # Hand the framing parameters over to the parent classes through kwargs.
        kwargs['fs'] = fs
        kwargs['win_length_samples'] = fs
        kwargs['hop_length_samples'] = hop_length_samples
        kwargs['win_length_seconds'] = 1.0
        kwargs['hop_length_seconds'] = hop_length_seconds

        # Run EmbeddingExtractor init, then the cooperative init (original call order)
        EmbeddingExtractor.__init__(self, **kwargs)
        super(EdgeL3Extractor, self).__init__(**kwargs)

        self.model = model
        self.retrain_type = retrain_type
        self.sparsity = sparsity
        self.center = center
        self.verbose = verbose

        backend = self._import_edgel3(silence_tensorflow=True)
        if self.model is None:
            self.model = self._load_model(backend)

    def _import_edgel3(self, silence_tensorflow=False):
        """Import the `edgel3` module and return it, optionally muting tensorflow logging first."""

        try:
            if silence_tensorflow:
                # Suppress tensorflow warnings
                import os
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

                import logging
                logging.getLogger('tensorflow').setLevel(logging.FATAL)

            import edgel3

        except ImportError:
            message = '{name}: Unable to import EdgeL3 module. You can install it with `pip install edgel3`.'.format(
                name=self.__class__.__name__
            )

            self.logger.exception(message)
            raise ImportError(message)

        return edgel3

    def _load_model(self, backend):
        """Load the embedding model selected by the current `retrain_type` and `sparsity`."""

        return backend.models.load_embedding_model(
            retrain_type=self.retrain_type,
            sparsity=self.sparsity
        )

    def to_string(self, ui=None, indent=0):
        """Get container information in a string

        Parameters
        ----------
        ui : FancyStringifier or FancyHTMLStringifier
            Stringifier class
            Default value FancyStringifier

        indent : int
            Amount of indention used
            Default value 0

        Returns
        -------
        str

        """

        stringifier = FancyStringifier() if ui is None else ui

        output = super(EdgeL3Extractor, self).to_string(ui=stringifier, indent=indent)
        output += stringifier.line(field='EdgeL3Extractor', indent=indent) + '\n'

        # One data row per extractor specific parameter, in a fixed order.
        for name in ('retrain_type', 'sparsity', 'center', 'verbose'):
            output += stringifier.data(indent=indent + 2, field=name, value=getattr(self, name)) + '\n'

        return output

    def __getstate__(self):
        # The model object is deliberately left out of the state; it is reloaded in __setstate__.
        state = super(EdgeL3Extractor, self).__getstate__()
        own_parameters = {
            'retrain_type': self.retrain_type,
            'sparsity': self.sparsity,
            'center': self.center,
            'verbose': self.verbose
        }
        state.update(own_parameters)

        return state

    def __setstate__(self, d):
        super(EdgeL3Extractor, self).__setstate__(d)

        for name in ('retrain_type', 'sparsity', 'center', 'verbose'):
            setattr(self, name, d[name])

        # Rebuild the model from the restored parameters.
        self.model = self._load_model(self._import_edgel3())

    def extract(self, y):
        """Extract features for the audio signal.

        Parameters
        ----------
        y : numpy.ndarray [shape=(n,)]
            Audio signal

        Returns
        -------
        embedding : np.ndarray
            Transpose of the embedding array returned by `edgel3.get_embedding`
            (presumably shape=(D, T) for a single clip - confirm against caller).

        Raises
        ------
        ImportError
            The `edgel3` module cannot be imported.

        """

        backend = self._import_edgel3()

        # Timestamps are returned by EdgeL3 as well but are not used here.
        result = backend.get_embedding(
            audio=y,
            sr=self.fs,
            model=self.model,
            center=self.center,
            hop_size=self.hop_length_seconds,
            verbose=self.verbose
        )
        embedding = result[0]

        return embedding.T