Source code for ketos.audio.gammatone

# ================================================================================ #
#   Authors: Fabio Frazao and Oliver Kirsebom                                      #
#   Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca                               #
#   Organization: MERIDIAN (https://meridian.cs.dal.ca/)                           #
#   Team: Data Analytics                                                           #
#   Project: ketos                                                                 #
#   Project goal: The ketos library provides functionalities for handling          #
#   and processing acoustic data and applying deep neural networks to sound        #
#   detection and classification tasks.                                            #
#                                                                                  #
#   License: GNU GPLv3                                                             #
#                                                                                  #
#       This program is free software: you can redistribute it and/or modify       #
#       it under the terms of the GNU General Public License as published by       #
#       the Free Software Foundation, either version 3 of the License, or          #
#       (at your option) any later version.                                        #
#                                                                                  #
#       This program is distributed in the hope that it will be useful,            #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#       GNU General Public License for more details.                               # 
#                                                                                  #
#       You should have received a copy of the GNU General Public License          #
#       along with this program.  If not, see <https://www.gnu.org/licenses/>.     #
# ================================================================================ #

""" 'audio.gammatone' module within the ketos library.

    This module provides utilities to process audio data 
    using an auditory filterbank based on the gammatone function.

    Contents:
        GammatoneFilterBank class
        AuralFeatures class
"""
import sys
import os
import copy
import warnings
import numpy as np
from version_parser.version import Version
import scipy
import matplotlib.pyplot as plt
from ketos.audio.waveform import Waveform
from ketos.audio.annotation import AnnotationHandler
from ketos.audio.base_audio import BaseAudio, BaseAudioTime, segment_data


# Module-level cache of gammatone filter coefficients, filled lazily by
# `get_filter_coeffs` and keyed by (sampling rate, center frequency).
gammatone_filter_coeff = {}


def compute_center_freqs(num_chan, sampl_rate, freq_min):
    """ Compute the center frequencies of the filter bank.

        The frequencies are evenly spaced on a logarithmic scale after a
        constant offset of 228.8 Hz has been added. The computed array is
        flipped before being returned, so that (for `freq_min` below half
        the sampling rate) it is sorted in increasing order and starts
        at `freq_min`.

        Args:
            num_chan: int
                Number of channels in the filter bank
            sampl_rate: float
                Sampling rate in Hz
            freq_min: float
                Minimum frequency in Hz

        Returns:
            : numpy.array
                Center frequencies in Hz
    """
    offset = 228.8
    upper = sampl_rate / 2. + offset  # half the sampling rate, shifted by the offset
    log_span = np.log(freq_min + offset) - np.log(upper)
    channels = np.linspace(1, num_chan, num_chan)  # 1, 2, ..., num_chan
    descending = np.exp(channels / num_chan * log_span) * upper - offset
    return np.flip(descending)
def filter_signal(signal, sampl_rate, freqs):
    """ Pass the signal through the gammatone filters.

        One filter is applied per center frequency, using
        `scipy.signal.filtfilt` with the coefficients returned by
        `get_filter_coeffs`. The filtered signals are stacked and the
        first two axes are swapped, so that for a 1D input signal the
        first axis of the output runs over samples and the second axis
        runs over filters.

        Args:
            signal: numpy.array
                Audio signal
            sampl_rate: float
                Sampling rate in Hz
            freqs: numpy.array
                Center frequencies of the filter bank

        Returns:
            : numpy.array
                The filtered signals stacked into a 2D array
    """
    per_filter = [
        scipy.signal.filtfilt(*get_filter_coeffs(sampl_rate, center_freq), signal)
        for center_freq in freqs
    ]
    return np.array(per_filter).swapaxes(0, 1)
def apply_weight_func(x, freqs):
    """ Apply C weighting function.

        This weighting function represents the approximate frequency
        sensitivity of the human auditory system.

        A weight is computed for every center frequency and multiplied
        onto the corresponding column (second axis) of `x`.

        Args:
            x: numpy.array
                The filtered signals
            freqs: numpy.array
                Center frequencies of the filter bank

        Returns:
            : numpy.array
                The C-weighted filtered signals
    """
    gain = 1.007
    upper_corner = 12200  # Hz
    lower_corner = 20.6   # Hz
    freqs_sq = freqs**2
    weights = gain * upper_corner**2 * freqs_sq / (freqs_sq + lower_corner**2) / (freqs_sq + upper_corner**2)
    return x * weights[np.newaxis, :]
def get_filter_coeffs(sampl_rate, freq):
    """ Get the gammatone filter coefficients.

        The coefficients are looked up in the module-level cache
        `gammatone_filter_coeff`. If they are not found there, they are
        computed with `compute_filter_coeffs` and stored in the cache
        before being returned.

        Args:
            sampl_rate: float
                Sampling rate in Hz
            freq: float
                Center frequency in Hz

        Returns:
            : tuple
                Coefficients of the gammatone filter
    """
    cache_key = (sampl_rate, freq)
    if cache_key in gammatone_filter_coeff:
        return gammatone_filter_coeff[cache_key]

    coeffs = compute_filter_coeffs(sampl_rate, freq)
    gammatone_filter_coeff[cache_key] = coeffs
    return coeffs
def compute_filter_coeffs(sampl_rate, freq):
    """ Compute the gammatone filter coefficients.

        The coefficients are obtained from `scipy.signal.gammatone`
        with `ftype='iir'`. If the installed Scipy version is older
        than 1.6.0, an explanatory message is printed and the
        interpreter is asked to exit with status 1.

        Args:
            sampl_rate: float
                Sampling rate in Hz
            freq: float
                Center frequency in Hz

        Returns:
            (b, a): tuple
                Coefficients of the gammatone filter

        Raises:
            SystemExit: If the installed Scipy version is older than 1.6.0
    """
    scipy_version = Version(scipy.__version__)
    if scipy_version < Version("1.6.0"):
        print('The `compute_filter_coeffs` method in the `gammatone` module requires Scipy>=1.6.0')
        print(f'The present environment only has Scipy=={scipy_version}')
        print('Note that Scipy>=1.6.0 requires Python>=3.7.0')
        # Use sys.exit rather than the `exit` builtin: the latter is only
        # injected by the `site` module for interactive use and is not
        # guaranteed to exist in every interpreter configuration.
        sys.exit(1)

    b, a = scipy.signal.gammatone(freq=freq, ftype='iir', fs=sampl_rate)
    return (b, a)
class GammatoneFilterBank(BaseAudioTime):
    """ Gammatone filter bank.

        The filtered signals are stored in a 2D numpy array, where the
        first axis (0) is the time dimension and the second axis (1) is
        the frequency dimension.

        The time resolution passed on to the parent class is the inverse
        of the sampling rate.

        Args:
            data: 2d numpy array
                Filtered data
            rate: float
                Sampling rate in Hz
            freqs: array-like
                Center frequencies of the filter bank in Hz
            filename: str or list(str)
                Name of the source audio file, if available.
            offset: float or array-like
                Position in seconds of the left edge of the filtered
                signal within the source audio file, if available.
            label: int
                Label. Optional
            annot: AnnotationHandler
                AnnotationHandler object. Optional
            weight_func: bool
                Apply C weighting function. Default is True.
                The constructor only stores this flag; the weighting
                itself is applied in `from_waveform`.

        Attributes:
            data: 2d numpy array
                Filtered data
            rate: float
                Sampling rate in Hz
            freqs: array-like
                Center frequencies of the filter bank in Hz
            filename: str or list(str)
                Name of the source audio file, if available.
            offset: float or array-like
                Position in seconds of the left edge of the filtered
                signal within the source audio file, if available.
            label: int
                Label. Optional
            annot: AnnotationHandler
                AnnotationHandler object. Optional
            weight_func: bool
                Apply C weighting function.
    """
    def __init__(self, data, rate, freqs, filename=None, offset=0, label=None, annot=None,
                 weight_func=True, **kwargs):
        # attributes specific to the filter bank are set before the
        # parent constructor is invoked; extra keyword arguments are
        # accepted but not used
        self.rate = rate
        self.freqs = freqs
        self.weight_func = weight_func
        super().__init__(
            data=data,
            time_res=1./rate,
            filename=filename,
            offset=offset,
            label=label,
            annot=annot,
        )

    @classmethod
    def empty(cls):
        """ Creates an empty GammatoneFilterBank object

            Returns:
                : GammatoneFilterBank
                    Instance with a (0,0)-shaped data array, a sampling
                    rate of 1 and no center frequencies.
        """
        no_data = np.empty(shape=(0,0), dtype=np.float64)
        return cls(data=no_data, rate=1, freqs=[])

    @classmethod
    def from_waveform(cls, audio, num_chan=20, freq_min=1, weight_func=True):
        """ Create a Gammatone Filter Bank from an instance of
            :class:`audio_signal.Waveform`.

            Args:
                audio: Waveform
                    Audio signal
                num_chan: int
                    Number of channels in the filter bank
                freq_min: float
                    Minimum frequency of the filter bank in Hz
                weight_func: bool
                    Apply C weighting function. Default is True.

            Returns:
                : GammatoneFilterBank
                    Gammatone filter bank
        """
        sampl_rate = audio.rate
        freqs = compute_center_freqs(num_chan=num_chan, sampl_rate=sampl_rate, freq_min=freq_min)
        filtered = filter_signal(signal=audio.data, sampl_rate=sampl_rate, freqs=freqs)

        if weight_func:
            filtered = apply_weight_func(x=filtered, freqs=freqs)

        return cls(
            data=filtered,
            rate=sampl_rate,
            freqs=freqs,
            filename=audio.filename,
            offset=audio.offset,
            label=audio.label,
            annot=audio.annot,
            weight_func=weight_func,
        )

    @classmethod
    def from_wav(cls, path, num_chan=20, freq_min=1, channel=0, rate=None,
                 offset=0, duration=None, resample_method='scipy', id=None,
                 normalize_wav=False, weight_func=True, **kwargs):
        """ Create a Gammatone Filter Bank directly from wav file.

            The arguments offset and duration can be used to select a portion of the wav file.

            Note that values specified for the arguments offset and duration may be subject
            to slight adjustments to ensure that the selected portion corresponds to an integer
            number of samples.

            If the loaded waveform contains no data, a RuntimeWarning is
            issued and an empty GammatoneFilterBank is returned.

            Args:
                path: str
                    Path to wav file
                num_chan: int
                    Number of channels in the filter bank
                freq_min: float
                    Minimum frequency of the filter bank in Hz
                channel: int
                    Channel to read from. Only relevant for stereo recordings
                rate: float
                    Desired sampling rate in Hz. If None, the original sampling rate will be used
                offset: float
                    Start time of selection in seconds, relative the start of the wav file.
                duration: float
                    Length of selection in seconds.
                resample_method: str
                    Resampling method. Only relevant if `rate` is specified. Options are
                        * kaiser_best
                        * kaiser_fast
                        * scipy (default)
                        * polyphase
                    See https://librosa.github.io/librosa/generated/librosa.core.resample.html
                    for details on the individual methods.
                id: str
                    Unique identifier (optional). If None, the filename will be used.
                normalize_wav: bool
                    Normalize the waveform to have a mean of zero (mean=0) and a standard
                    deviation of unity (std=1) before computing the filter bank. Default is False.
                weight_func: bool
                    Apply C weighting function. Default is True.
                **kwargs:
                    Additional keyword arguments, passed on to `Waveform.from_wav`.

            Returns:
                : GammatoneFilterBank
                    Gammatone filter bank

            Example:
                >>> # load gammatone filter bank from wav file
                >>> from ketos.audio.gammatone import GammatoneFilterBank
                >>> gfb = GammatoneFilterBank.from_wav('ketos/tests/assets/grunt1.wav', num_chan=20, freq_min=10, rate=1000)
                >>> # print the center frequencies rounded to 1 decimal
                >>> print(np.round(gfb.freqs,1))
                [ 10.   23.7  38.2  53.5  69.7  86.8 104.9 124.1 144.3 165.7 188.4 212.3
                 237.6 264.4 292.7 322.6 354.2 387.7 423.1 460.5]
                >>> # display the 4th filter bank signal
                >>> fig = gfb.plot(filter_id=3)
                >>> fig.savefig("ketos/tests/assets/tmp/gfb3_grunt1.png")
                >>> plt.close(fig)

                .. image:: ../../../ketos/tests/assets/tmp/gfb3_grunt1.png
        """
        # read the (selected portion of the) wav file
        waveform = Waveform.from_wav(path=path, channel=channel, rate=rate, offset=offset,
                                     duration=duration, resample_method=resample_method,
                                     id=id, normalize_wav=normalize_wav, **kwargs)

        # nothing to filter: warn and hand back an empty object
        if len(waveform.get_data()) == 0:
            warnings.warn("Empty GammatoneFilterBank returned", RuntimeWarning)
            return cls.empty()

        # run the waveform through the filter bank
        return cls.from_waveform(audio=waveform, num_chan=num_chan, freq_min=freq_min,
                                 weight_func=weight_func)

    def get_repres_attrs(self):
        """ Get audio representation attributes

            Returns:
                : dict
                    The attributes reported by the parent class, extended
                    with the sampling rate, the center frequencies, the
                    weighting flag and the class name (key 'type').
        """
        own_attrs = {
            'rate': self.rate,
            'freqs': self.freqs,
            'weight_func': self.weight_func,
            'type': self.__class__.__name__,
        }
        attrs = super().get_repres_attrs()
        attrs.update(own_attrs)
        return attrs

    def plot(self, filter_id, show_annot=False, figsize=(5,4), label_in_title=True, show_envelope=False):
        """ Plot the filtered signal with proper axes ranges and labels.

            Optionally, also display annotations as boxes superimposed on the signal.

            The selected filtered signal is wrapped in a Waveform object,
            whose plot method produces the figure. The center frequency
            of the filter is appended to the figure title.

            Note: The resulting figure can be shown (fig.show())
            or saved (fig.savefig(file_name))

            Args:
                filter_id: int
                    Filter to be plotted.
                show_annot: bool
                    Display annotations
                figsize: tuple
                    Figure size
                label_in_title: bool
                    Include label (if available) in figure title
                show_envelope: bool
                    Display envelope on top of signal

            Returns:
                : matplotlib.figure.Figure
                    A figure object.

            Example:
                >>> from ketos.audio.gammatone import GammatoneFilterBank
                >>> # load gammatone filter bank
                >>> gfb = GammatoneFilterBank.from_wav('ketos/tests/assets/grunt1.wav', num_chan=20, freq_min=10, rate=1000)
                >>> # add an annotation
                >>> gfb.annotate(start=1.2, end=1.6, freq_min=70, freq_max=600, label=1)
                >>> # show the 4th filter bank with annotation box
                >>> fig = gfb.plot(filter_id=3, show_annot=True)
                >>> fig.savefig("ketos/tests/assets/tmp/gfb3_w_annot_box.png")
                >>> plt.close(fig)

                .. image:: ../../../ketos/tests/assets/tmp/gfb3_w_annot_box.png
        """
        # column `filter_id` of the data array holds the output of that filter
        channel_signal = self.get_data()[:,filter_id]
        channel_wf = Waveform(
            data=channel_signal,
            rate=self.rate,
            filename=self.get_filename(),
            offset=self.get_offset(),
            label=self.get_label(),
            annot=self.get_annotations(),
        )
        title_suffix = f', {self.freqs[filter_id]:.1f} Hz'
        return channel_wf.plot(show_annot=show_annot, figsize=figsize, label_in_title=label_in_title,
                               append_title=title_suffix, show_envelope=show_envelope)
class AuralFeatures(BaseAudio):
    """ Aural features computed with the aural-features package
        (https://pypi.org/project/aural-features/).

        Args:
            data: 1d numpy array
                Feature values
            filename: str
                Name of the source audio file, if available.
            offset: float
                Position in seconds of the left edge of the audio segment within
                the source audio file, if available.
            label: int
                Label. Optional
            annot: AnnotationHandler
                AnnotationHandler object. Optional
            waveform_transform_log: list
                Stored on the instance as `waveform_transform_log`.
                If None, a new empty list is used. Optional
    """
    def __init__(self, data, filename=None, offset=0, label=None, annot=None,
                 waveform_transform_log=None, **kwargs):
        super().__init__(data=data, filename=filename, offset=offset, label=label, annot=annot)

        # a fresh list per instance avoids sharing a mutable default
        if waveform_transform_log is None:
            waveform_transform_log = []

        self.waveform_transform_log = waveform_transform_log

    @classmethod
    def from_waveform(cls, audio, filter_pad_samples=64, global_km_window_seconds=0.25,
                      local_km_window_seconds=0.008, filter_n=100, filter_min_hz=50):
        """ Compute aural features from an instance of :class:`audio_signal.Waveform`.

            The returned feature vector consists of the duration reported
            by the aural-features package followed by the individual
            feature values, with NaN's replaced by zeros. If the package
            raises `IsolationFailed`, a vector of 46 zeros is returned
            instead.

            Args:
                audio: Waveform
                    Audio signal
                filter_pad_samples: int
                    Number of samples used for padding
                global_km_window_seconds: float
                    Length of global KM window in seconds
                local_km_window_seconds: float
                    Length of local KM window in seconds
                filter_n: int
                    Number of filters
                filter_min_hz: float
                    Min filter frequency in Hz

            Returns:
                : AuralFeatures
                    Aural features

            Raises:
                ImportError: If the aural-features package is not installed.
        """
        # Always execute the import statement. It is cheap once the module
        # is cached in sys.modules, and it guarantees that the local name
        # `au` is bound on every call. (Skipping the import when 'aural' was
        # already loaded left `au` undefined on the second and later calls.)
        try:
            import aural.meridian as au
        except ImportError as err:
            print('aural-features package not found.')
            print('aural-features is required by the AuralFeatures class.')
            print('install with `pip install aural-features`.')
            print('note that aural-features requires Scipy>=1.6 and Python>=3.7')
            raise ImportError('aural-features package not found; '
                              'install with `pip install aural-features`') from err

        conf = au.Config()  # For defaults, should be safe to start
        conf.filter_pad_samples = filter_pad_samples
        conf.global_km_window_seconds = global_km_window_seconds
        conf.local_km_window_seconds = local_km_window_seconds
        conf.filter_n = filter_n
        conf.filter_min_hz = filter_min_hz

        try:
            duration, features = au.extract(audio.get_data(), audio.rate, conf)
        except au.IsolationFailed:
            # NOTE(review): 46 is presumably the length of the regular
            # feature vector (duration + all feature fields) -- confirm
            # against the aural-features package.
            values = np.zeros(46)
        else:
            # flatten: duration first, then every field of every feature tuple
            values = [duration]
            for feature in features:
                values.extend(feature._asdict().values())

            values = np.nan_to_num(np.array(values), nan=0.0)  # replace NaN's with zeros

        return cls(data=values, filename=audio.filename, offset=audio.offset,
                   label=audio.label, annot=audio.annot)

    @classmethod
    def from_wav(cls, path, filter_pad_samples=64, global_km_window_seconds=0.25,
                 local_km_window_seconds=0.008, filter_n=100, filter_min_hz=50,
                 channel=0, rate=None, offset=0, duration=None, resample_method='scipy',
                 id=None, normalize_wav=False, waveform_transforms=None, **kwargs):
        """ Compute aural features directly from wav file.

            The arguments offset and duration can be used to select a portion of the wav file.

            Note that values specified for the arguments offset and duration may be subject
            to slight adjustments to ensure that the selected portion corresponds to an integer
            number of samples.

            If the loaded waveform contains no data, a RuntimeWarning is
            issued and the result of `cls.empty()` is returned.

            Args:
                path: str
                    Path to wav file
                filter_pad_samples: int
                    Number of samples used for padding
                global_km_window_seconds: float
                    Length of global KM window in seconds
                local_km_window_seconds: float
                    Length of local KM window in seconds
                filter_n: int
                    Number of filters
                filter_min_hz: float
                    Min filter frequency in Hz
                channel: int
                    Channel to read from. Only relevant for stereo recordings
                rate: float
                    Desired sampling rate in Hz. If None, the original sampling rate will be used
                offset: float
                    Start time of selection in seconds, relative the start of the wav file.
                duration: float
                    Length of selection in seconds.
                resample_method: str
                    Resampling method. Only relevant if `rate` is specified. Options are
                        * kaiser_best
                        * kaiser_fast
                        * scipy (default)
                        * polyphase
                    See https://librosa.github.io/librosa/generated/librosa.core.resample.html
                    for details on the individual methods.
                id: str
                    Unique identifier (optional). If None, the filename will be used.
                normalize_wav: bool
                    Normalize the waveform to have a mean of zero (mean=0) and a standard
                    deviation of unity (std=1) before computing the features. Default is False.
                waveform_transforms: list(dict)
                    List of dictionaries, where each dictionary specifies the name of
                    a transformation to be applied to the waveform before computing
                    the features. For example, {"name":"add_gaussian_noise", "sigma":0.5}
                **kwargs:
                    Additional keyword arguments, passed on to `Waveform.from_wav`.

            Returns:
                : AuralFeatures
                    Aural features
        """
        # load audio
        audio = Waveform.from_wav(path=path, channel=channel, rate=rate, offset=offset,
                                  duration=duration, resample_method=resample_method,
                                  id=id, normalize_wav=normalize_wav,
                                  transforms=waveform_transforms, **kwargs)

        if len(audio.get_data()) == 0:
            warnings.warn("Empty AuralFeatures returned", RuntimeWarning)
            # NOTE(review): `empty` is not defined in this class; presumably
            # it is inherited from BaseAudio -- confirm.
            return cls.empty()

        # compute aural features
        return cls.from_waveform(audio=audio,
                                 filter_pad_samples=filter_pad_samples,
                                 global_km_window_seconds=global_km_window_seconds,
                                 local_km_window_seconds=local_km_window_seconds,
                                 filter_n=filter_n,
                                 filter_min_hz=filter_min_hz)

    def get_repres_attrs(self):
        """ Get audio representation attributes

            Returns:
                : dict
                    The attributes reported by the parent class, extended
                    with the class name (key 'type').
        """
        attrs = super().get_repres_attrs()
        attrs.update({'type': self.__class__.__name__})
        return attrs