Source code for ketos.audio.spectrogram

# ================================================================================ #
#   Authors: Fabio Frazao and Oliver Kirsebom                                      #
#   Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca                               #
#   Organization: MERIDIAN (https://meridian.cs.dal.ca/)                           #
#   Team: Data Analytics                                                           #
#   Project: ketos                                                                 #
#   Project goal: The ketos library provides functionalities for handling          #
#   and processing acoustic data and applying deep neural networks to sound        #
#   detection and classification tasks.                                            #
#                                                                                  #
#   License: GNU GPLv3                                                             #
#                                                                                  #
#       This program is free software: you can redistribute it and/or modify       #
#       it under the terms of the GNU General Public License as published by       #
#       the Free Software Foundation, either version 3 of the License, or          #
#       (at your option) any later version.                                        #
#                                                                                  #
#       This program is distributed in the hope that it will be useful,            #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#       GNU General Public License for more details.                               # 
#                                                                                  #
#       You should have received a copy of the GNU General Public License          #
#       along with this program.  If not, see <https://www.gnu.org/licenses/>.     #
# ================================================================================ #

""" 'audio.spectrogram' module within the ketos library.

    This module provides utilities to work with spectrograms.

    Spectrograms are two-dimensional visual representations of 
    sound waves, in which time is shown along the horizontal 
    axis, frequency along the vertical axis, and color is used 
    to indicate the sound amplitude. Read more on Wikipedia:
    https://en.wikipedia.org/wiki/Spectrogram

    The module contains the parent class Spectrogram, and four
    child classes (MagSpectrogram, PowerSpectrogram, MelSpectrogram, 
    CQTSpectrogram), which inherit methods and attributes from the 
    parent class.

    Note, however, that not all methods (e.g. crop) work for all 
    child classes. See the documentation of the individual methods 
    for further details.

    Contents:
        Spectrogram class:
        MagSpectrogram class:
        PowerSpectrogram class:
        MelSpectrogram class:
        CQTSpectrogram class
"""
import os
import copy
import warnings
import numpy as np
from scipy.signal import get_window
from scipy import ndimage
from skimage.transform import resize
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from ketos.audio.waveform import Waveform, get_duration, get_sampling_rate, _validate_wf_args
import ketos.audio.utils.misc as aum
from ketos.audio.utils.axis import LinearAxis, Log2Axis, MelAxis
from ketos.audio.annotation import AnnotationHandler
from ketos.audio.utils.filter import enhance_signal, reduce_tonal_noise
from ketos.audio.base_audio import BaseAudioTime, segment_data


[docs]def add_specs(a, b, offset=0, scale=1, make_copy=False): """ Place two spectrograms on top of one another by adding their pixel values. The spectrograms must be of the same type, and share the same time resolution. The spectrograms must have consistent frequency axes. For linear frequency axes, this implies having the same resolution; for logarithmic axes with base 2, this implies having the same number of bins per octave minimum values that differ by a factor of :math:`2^{n/m}` where :math:`m` is the number of bins per octave and :math:`n` is any integer. No check is made for the consistency of the frequency axes. Note that the attributes filename, offset, and label of spectrogram `b` is being added are lost. The sum spectrogram has the same dimensions (time x frequency) as spectrogram `a`. Args: a: Spectrogram Spectrogram b: Spectrogram Spectrogram to be added offset: float Shift spectrogram `b` by this many seconds relative to spectrogram `a`. scale: float Scaling factor applied to signal that is added make_copy: bool Make copies of both spectrograms, leaving the orignal instances unchanged by the addition operation. Returns: ab: Spectrogram Sum spectrogram """ assert a.type == b.type, "It is not possible to add spectrograms with different types" assert a.time_res() == b.time_res(), 'It is not possible to add spectrograms with different time resolutions' # make copy if make_copy: ab = a.deepcopy() else: ab = a # compute cropping boundaries for time axis end = a.duration() - offset # determine position of b within a pos_x = a.time_ax.bin(offset, truncate=True) #lower left corner time bin pos_y = a.freq_ax.bin(b.freq_min(), truncate=True) #lower left corner frequency bin # crop spectrogram b b = b.crop(start=-offset, end=end, freq_min=a.freq_min(), freq_max=a.freq_max(), make_copy=make_copy) # add the two images bins_x = b.data.shape[0] bins_y = b.data.shape[1] ab.data[pos_x:pos_x+bins_x, pos_y:pos_y+bins_y] += scale * b.data return ab
[docs]def load_audio_for_spec(path, channel, rate, window, step, offset, duration, resample_method, id=None, normalize_wav=False, waveform_transforms=None, smooth=0.01, **kwargs): """ Load audio data from a wav file for the specific purpose of computing the spectrogram. The loaded audio covers a time interval that extends slightly beyond that specified, [offset, offset+duration], as needed to compute the full spectrogram without padding with zeros at either end. Moreover, the returned instance has two extra class attributes not usually associated with instances of the Waveform class, * stft_args: dict Parameters to be used for the computation of the Short-Time Fourier transform * len_extend: tuple(int,int) Length (no. samples) by which the time interval has been extended at both ends (left, right). Returns None if the requested data segment is empty. Args: path: str Path to wav file channel: int Channel to read from. Only relevant for stereo recordings rate: float Desired sampling rate in Hz. If None, the original sampling rate will be used window: float Window size in seconds that will be used for computing the spectrogram step: float Step size in seconds that will be used for computing the spectrogram offset: float Start time of spectrogram in seconds, relative the start of the wav file. duration: float Length of spectrogrma in seconds. resample_method: str Resampling method. Only relevant if `rate` is specified. Options are: * kaiser_best * kaiser_fast * scipy (default) * polyphase See https://librosa.github.io/librosa/generated/librosa.core.resample.html for details on the individual methods. id: str Unique identifier (optional). If None, the filename will be used. normalize_wav: bool Normalize the waveform to have a mean of zero (mean=0) and a standard deviation of unity (std=1). Default is False. smooth: float Width in seconds of the smoothing region used for stitching together audio files. \**kwargs: additional keyword arguments Keyword arguments to be passed to :meth:`ketos.audio.Waveform.from_wav`. Returns: audio: Waveform The audio signal """ path, offset, duration = _validate_wf_args(path, offset, duration) # make copies so we don't change the input arguments offset_ext = offset.copy() duration_ext = duration.copy() if rate is None: rate = get_sampling_rate(path=path) duration_ext = get_duration(path=path, offset=offset, duration=duration_ext) total_duration = np.sum(duration_ext) if total_duration <= 0: return None nominal_offset = offset[0] # compute the arguments for the short-time fourier transform stft_args = aum.segment_args(rate=rate, offset=nominal_offset, window=window, step=step, duration=total_duration) # modify offset and duration to extend audio segment at both ends offset_ext[0] = stft_args['offset_len'] / rate left_ext = nominal_offset - offset_ext[0] total_duration_ext = int(stft_args['num_segs'] * stft_args['step_len'] + stft_args['win_len']) / rate right_ext = total_duration_ext - total_duration - left_ext duration_ext[0] += left_ext duration_ext[-1] += right_ext # now load extended audio with from_wav method audio = Waveform.from_wav(path=path, rate=rate, channel=channel, offset=offset_ext, duration=duration_ext, resample_method=resample_method, id=id, normalize_wav=normalize_wav, transforms=waveform_transforms, smooth=smooth, **kwargs) if len(audio.get_data()) == 0: return None, None # make sure we don't pad twice stft_args["offset_len"] = 0 # use the correct offset value audio.offset = nominal_offset # create extra class attributes audio.stft_args = stft_args n_left_ext = aum.num_samples(left_ext, audio.rate) n_right_ext = aum.num_samples(right_ext, audio.rate) audio.len_extend = (n_left_ext, n_right_ext) return audio
[docs]class Spectrogram(BaseAudioTime): """ Spectrogram. Parent class for MagSpectrogram, PowerSpectrogram, MelSpectrogram, and CQTSpectrogram. The Spectrogram class stores the spectrogram pixel values in a numpy array, where the first axis (0) is the time dimension and the second axis (1) is the frequency dimensions. Args: data: numpy array Spectrogram matrix. time_res: float Time resolution in seconds (corresponds to the bin size used on the time axis) type: str Spectrogram type. Options include, * 'Mag': Magnitude spectrogram * 'Pow': Power spectrogram * 'Mel': Mel spectrogram * 'CQT': CQT spectrogram freq_ax: LinearAxis or Log2Axis Axis object for the frequency dimension filename: str or list(str) Name of the source audio file, if available. offset: float or array-like Position in seconds of the left edge of the spectrogram within the source audio file, if available. label: int Spectrogram label. Optional annot: AnnotationHandler AnnotationHandler object. Optional transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} transform_log: list(dict) List of transforms that have been applied to this spectrogram waveform_transform_log: list(dict) List of transforms that have been applied to the waveform before generating this spectrogram Attributes: data: numpy array Spectrogram matrix. time_ax: LinearAxis Axis object for the time dimension freq_ax: LinearAxis or Log2Axis Axis object for the frequency dimension type: str Spectrogram type. Options include, * 'Mag': Magnitude spectrogram * 'Pow': Power spectrogram * 'Mel': Mel spectrogram * 'CQT': CQT spectrogram filename: str or list(str) Name of the source audio file. offset: float or array-like Position in seconds of the left edge of the spectrogram within the source audio file. label: int Spectrogram label. annot: AnnotationHandler AnnotationHandler object. transform_log: list(dict) List of transforms that have been applied to this spectrogram waveform_transform_log: list(dict) List of transforms that have been applied to the waveform before generating this spectrogram """ def __init__(self, data, time_res, type, freq_ax, filename=None, offset=0, label=None, annot=None, transforms=None, transform_log=None, waveform_transform_log=None, **kwargs): super().__init__(data=data, time_res=time_res, filename=filename, offset=offset, label=label, annot=annot, transform_log=transform_log, **kwargs) if waveform_transform_log is None: waveform_transform_log = [] self.freq_ax = freq_ax self.type = type self.decibel = True self.allowed_transforms.update({'blur': self.blur, 'enhance_signal': self.enhance_signal, 'reduce_tonal_noise': self.reduce_tonal_noise, 'resize': self.resize}) self.apply_transforms(transforms) self.waveform_transform_log = waveform_transform_log
[docs] @classmethod def infer_shape(cls, **kwargs): """ Infers the spectrogram shape that would result if the class were instantiated with a specific set of parameter values. Returns a None value if the shape could not be inferred. Accepts the same list of arguments as the `from_wav` method, which is implemented in the child classes. Note: The current implementation involves computing a dummy spectrogram. Therefore, if this method is called repeatedly the computational overhead can become substantial. Returns: : tuple Inferred shape. If the parameter value do not allow the shape be inferred, a None value is returned. """ if 'duration' in kwargs.keys() and 'rate' in kwargs.keys() and hasattr(cls, 'from_waveform'): sr = kwargs['rate'] num_samples = int(kwargs['duration'] * sr) y = np.zeros(num_samples) wf = Waveform(data=y, rate=sr) kwargs.pop('rate', None) kwargs.pop('duration', None) x = cls.from_waveform(wf, **kwargs) return x.get_data().shape else: return None
[docs] def get_repres_attrs(self): """ Get audio representation attributes """ attrs = super().get_repres_attrs() attrs.update({'type':self.type, 'waveform_transform_log': self.waveform_transform_log}) return attrs
[docs] def get_kwargs(self): """ Get keyword arguments required to create a copy of this instance. Does not include the data array and annotation handler. """ kwargs = super().get_kwargs() kwargs.update({'freq_ax': self.freq_ax}) return kwargs
[docs] def freq_min(self): """ Get spectrogram minimum frequency in Hz. Returns: : float Frequency in Hz """ return self.freq_ax.min()
[docs] def freq_max(self): """ Get spectrogram maximum frequency in Hz. Returns: : float Frequency in Hz """ return self.freq_ax.max()
[docs] def crop(self, start=None, end=None, length=None,\ freq_min=None, freq_max=None, height=None, make_copy=False): """ Crop spectogram along time axis, frequency axis, or both. Args: start: float Start time in seconds, measured from the left edge of spectrogram. end: float End time in seconds, measured from the left edge of spectrogram. length: int Horizontal size of the cropped image (number of pixels). If provided, the `end` argument is ignored. freq_min: float Lower frequency in Hz. freq_max: str or float Upper frequency in Hz. height: int Vertical size of the cropped image (number of pixels). If provided, the `freq_max` argument is ignored. make_copy: bool Return a cropped copy of the spectrogra. Leaves the present instance unaffected. Default is False. Returns: spec: Spectrogram Cropped spectrogram Examples: >>> import numpy as np >>> import matplotlib.pyplot as plt >>> from ketos.audio.spectrogram import Spectrogram >>> from ketos.audio.utils.axis import LinearAxis >>> # Create a spectrogram with shape (20,30), time resolution of >>> # 0.5 s, random pixel values, and a linear frequency axis from >>> # 0 to 300 Hz, >>> ax = LinearAxis(bins=30, extent=(0.,300.), label='Frequency (Hz)') >>> img = np.random.rand(20,30) >>> spec = Spectrogram(data=img, time_res=0.5, type='Mag', freq_ax=ax) >>> # Draw the spectrogram >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/spec_orig.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_orig.png >>> # Crop the spectrogram along time axis >>> spec1 = spec.crop(start=2.0, end=4.2, make_copy=True) >>> # Draw the spectrogram >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/spec_cropped.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_cropped.png """ spec = super().crop(start=start, end=end, length=length, make_copy=make_copy) #crop time axis # crop frequency axis b1, b2 = spec.freq_ax.cut(x_min=freq_min, x_max=freq_max, bins=height) # add frequency information to log if not make_copy: self.transform_log[-1]['freq_min'] = freq_min self.transform_log[-1]['freq_max'] = freq_max # crop image spec.data = spec.data[:, b1:b2+1] # crop annotations, if any if spec.annot is not None: spec.annot.crop(freq_min=freq_min, freq_max=freq_max) return spec
[docs] def add(self, spec, offset=0, scale=1, make_copy=False): """ Add another spectrogram on top of this spectrogram. The spectrograms must be of the same type, and share the same time resolution. The spectrograms must have consistent frequency axes. For linear frequency axes, this implies having the same resolution; for logarithmic axes with base 2, this implies having the same number of bins per octave minimum values that differ by a factor of :math:`2^{n/m}` where :math:`m` is the number of bins per octave and :math:`n` is any integer. No check is made for the consistency of the frequency axes. Note that the attributes filename, offset, and label of the spectrogram that is being added are lost. The sum spectrogram has the same dimensions (time x frequency) as the original spectrogram. Args: spec: Spectrogram Spectrogram to be added offset: float Shift the spectrograms that is being added by this many seconds relative to the original spectrogram. scale: float Scaling factor applied to spectrogram that is added make_copy: bool Make copies of both spectrograms so as to leave the original instances unchanged. Returns: : Spectrogram Sum spectrogram """ return add_specs(a=self, b=spec, offset=offset, scale=scale, make_copy=make_copy)
[docs] def blur(self, sigma_time, sigma_freq=0): """ Blur the spectrogram using a Gaussian filter. Note that the spectrogram frequency axis must be linear if sigma_freq > 0. This uses the Gaussian filter method from the scipy.ndimage package: https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter.html Args: sigma_time: float Gaussian kernel standard deviation along time axis in seconds. Must be strictly positive. sigma_freq: float Gaussian kernel standard deviation along frequency axis in Hz. Example: >>> from ketos.audio.spectrogram import Spectrogram >>> from ketos.audio.waveform import Waveform >>> import matplotlib.pyplot as plt >>> # create audio signal >>> s = Waveform.morlet(rate=1000, frequency=300, width=1) >>> # create spectrogram >>> spec = MagSpectrogram.from_waveform(s, window=0.2, step=0.05) >>> # show image >>> fig = spec.plot() >>> plt.close(fig) >>> # apply very small amount (0.01 sec) of horizontal blur >>> # and significant amount of vertical blur (30 Hz) >>> spec.blur(sigma_time=0.01, sigma_freq=30) >>> # show blurred image >>> fig = spec.plot() >>> plt.close(fig) .. image:: ../_static/morlet_spectrogram.png .. image:: ../_static/morlet_spectrogram_blurred.png """ assert sigma_time > 0, "sigma_time must be strictly positive" sig_t = sigma_time / self.time_res() if sigma_freq > 0: assert isinstance(self.freq_ax, LinearAxis), "Frequency axis must be linear when sigma_freq > 0" sig_f = sigma_freq / self.freq_ax.bin_width() else: sig_f = 0 self.data = ndimage.gaussian_filter(input=self.data, sigma=(sig_t, sig_f)) self.transform_log.append({'name':'blur', 'sigma_time':sigma_time, 'sigma_freq':sigma_freq})
[docs] def enhance_signal(self, enhancement=1.): """ Enhance the contrast between regions of high and low intensity. See :func:`audio.image.enhance_image` for implementation details. Args: enhancement: float Parameter determining the amount of enhancement. """ self.data = enhance_signal(self.data, enhancement=enhancement) self.transform_log.append({'name':'enhance_signal', 'enhancement':enhancement})
[docs] def reduce_tonal_noise(self, method='MEDIAN', **kwargs): """ Reduce continuous tonal noise produced by e.g. ships and slowly varying background noise See :func:`audio.image.reduce_tonal_noise` for implementation details. Currently, offers the following two methods: 1. MEDIAN: Subtracts from each row the median value of that row. 2. RUNNING_MEAN: Subtracts from each row the running mean of that row. The running mean is computed according to the formula given in Baumgartner & Mussoline, JASA 129, 2889 (2011); doi: 10.1121/1.3562166 Args: method: str Options are 'MEDIAN' and 'RUNNING_MEAN' Optional args: time_constant: float Time constant in seconds, used for the computation of the running mean. Must be provided if the method 'RUNNING_MEAN' is chosen. Example: >>> # read audio file >>> from ketos.audio.waveform import Waveform >>> aud = Waveform.from_wav('ketos/tests/assets/grunt1.wav') >>> # compute the spectrogram >>> from ketos.audio.spectrogram import MagSpectrogram >>> spec = MagSpectrogram.from_waveform(aud, window=0.2, step=0.02) >>> # keep only frequencies below 800 Hz >>> spec = spec.crop(freq_max=800) >>> # show spectrogram as is >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/spec_before_tonal.png") >>> plt.close(fig) >>> # tonal noise reduction >>> spec.reduce_tonal_noise() >>> # show modified spectrogram >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/spec_after_tonal.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_before_tonal.png .. image:: ../../../ketos/tests/assets/tmp/spec_after_tonal.png """ if 'time_constant' in kwargs.keys(): time_const_len = kwargs['time_constant'] / self.time_ax.bin_width() else: time_const_len = None self.data = reduce_tonal_noise(self.data, method=method, time_const_len=time_const_len) transf = {'name':'reduce_tonal_noise', 'method':method} if 'time_constant' in kwargs.keys(): transf.update({'time_constant': kwargs['time_constant']}) self.transform_log.append(transf)
[docs] def resize(self, shape=None, time_res=None, **kwargs): """ Resize the spectrogram. The resizing operation can be controlled either by specifying the shape of the resized spectrogram or by specifying the desired time resolution. In the latter case, the spectrogram is only resized along the time axis. The resizing operation is performed using the `resize` method of the scikit-image package, which interpolates the pixel values: https://scikit-image.org/docs/dev/api/skimage.transform.html#skimage.transform.resize Use keyword arguments to control the behavior of scikit-image's resize operation. Args: shape: tuple(int,int) Shape of the resized spectrogram time_res: float Time resolution of the resized spectrogram in seconds. Note that the actual time resolution of the resized spectrogram may differ slightly from that specified via the time_res argument, as required to produce an image with an integer number of time bins. Returns: None Example: >>> from ketos.audio.spectrogram import MagSpectrogram >>> # load spectrogram >>> spec = MagSpectrogram.from_wav('ketos/tests/assets/grunt1.wav', window=0.2, step=0.02) >>> # add an annotation >>> spec.annotate(start=1.1, end=1.6, freq_min=70, freq_max=600, label=1) >>> # keep only frequencies below 800 Hz >>> spec = spec.crop(freq_max=800) >>> # make a copy of the current spectrogram, then reduce time resolution by a factor of eight >>> spec_orig = spec.deepcopy() >>> new_time_res = 8.0 * spec.time_res() >>> spec.resize(time_res=new_time_res) >>> # show spectrograms >>> fig = spec_orig.plot(show_annot=True) >>> fig.savefig("ketos/tests/assets/tmp/spec_w_annot_box.png") >>> plt.close(fig) >>> fig = spec.plot(show_annot=True) >>> fig.savefig("ketos/tests/assets/tmp/spec_w_annot_box_reduced_resolution.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_w_annot_box.png .. image:: ../../../ketos/tests/assets/tmp/spec_w_annot_box_reduced_resolution.png """ assert shape is not None or time_res is not None, "either shape or time_res must be specified" transf = {'name':'resize'} #log transform attributes # deduce new shape from time_res argument if shape is None: n_bins = int(self.time_res() / time_res * self.data.shape[0]) shape = (n_bins, self.data.shape[1]) transf.update({'time_res': time_res}) else: transf.update({'shape': shape}) if np.ndim(self.data) == 3: shape = (shape[0], shape[1], self.data.shape[2]) # resize time axis if shape[0] != self.data.shape[0]: self.time_ax.resize(bins=shape[0]) # resize frequency axis if shape[1] != self.data.shape[1]: self.freq_ax.resize(bins=shape[1]) # resize data array self.data = resize(self.data, output_shape=shape, **kwargs) transf.update(kwargs) self.transform_log.append(transf)
[docs] def plot(self, show_annot=False, figsize=(5,4), cmap='viridis', label_in_title=True, vmin=None, vmax=None, annot_kwargs=None): """ Plot the spectrogram with proper axes ranges and labels. Optionally, also display annotations as boxes superimposed on the spectrogram. The colormaps available can be seen here: https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html Note: The resulting figure can be shown (fig.show()) or saved (fig.savefig(file_name)) Args: show_annot: bool Display annotations figsize: tuple Figure size cmap: string The colormap to be used label_in_title: bool Include label (if available) in figure title vmin, vmax : scalar, optional When using scalar data and no explicit norm, vmin and vmax define the data range that the colormap covers. By default, the colormap covers the complete value range of the supplied data. vmin, vmax are ignored if the norm parameter is used. annot_kwargs: dict Annotation box extra parameters following matplotlib values. Only relevant if show_annot is True. The following matplotlib options are currently supported: ============== ======================================================== Property description ============== ======================================================== color color for the annotation box and text. See matplotlib for color options linewidth width for the annotaiton box. float or None fontsize float or {'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'} fontweight {a numeric value in range 0-1000, 'ultralight', 'light', 'normal', 'regular', 'book', 'medium', 'roman', 'semibold', 'demibold', 'demi', 'bold', 'heavy', 'extra bold', 'black'} ============== ======================================================== A dictionary may be used to specify different options for different label values. For example, {1: {"color": "C0", "fontweight": "bold"},3: {"color": "C2",}} would assign the color "C0" and fontweight bold to label value 1 and "C2" to label value 3. The default color is "C1". Returns: fig: matplotlib.figure.Figure A figure object. Example: >>> from ketos.audio.spectrogram import MagSpectrogram >>> # load spectrogram >>> spec = MagSpectrogram.from_wav('ketos/tests/assets/grunt1.wav', window=0.2, step=0.02) >>> # add an annotation >>> spec.annotate(start=1.1, end=1.6, freq_min=70, freq_max=600, label=1) >>> # keep only frequencies below 800 Hz >>> spec = spec.crop(freq_max=800) >>> # show spectrogram with annotation box >>> fig = spec.plot(show_annot=True) >>> fig.savefig("ketos/tests/assets/tmp/spec_w_annot_box.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_w_annot_box.png """ fig, ax = super().plot(figsize, label_in_title) x = self.get_data() # select image data extent = (0., self.duration(), self.freq_min(), self.freq_max()) # axes ranges img = ax.imshow(x.T, aspect='auto', origin='lower', cmap=cmap, extent=extent, vmin=vmin, vmax=vmax)# draw image ax.set_ylabel(self.freq_ax.label) # axis label if self.decibel: fig.colorbar(img, ax=ax, format='%+2.0f dB')# colobar else: fig.colorbar(img, ax=ax, label='Amplitude')# colobar # superimpose annotation boxes if show_annot: self._draw_annot_boxes(ax, annot_kwargs=annot_kwargs) #fig.tight_layout() return fig
def _draw_annot_boxes(self, ax, annot_kwargs=None): """Draws annotations boxes on top of the spectrogram Args: ax: matplotlib.axes.Axes Axes object annot_kwargs: dict Annotation box extra parameters following matplotlib values. Only relevant if show_annot is True. The following matplotlib options are currently supported: ============== ======================================================== Property description ============== ======================================================== color color for the annotation box and text. See matplotlib for color options linewidth width for the annotaiton box. float or None fontsize float or {'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'} fontweight {a numeric value in range 0-1000, 'ultralight', 'light', 'normal', 'regular', 'book', 'medium', 'roman', 'semibold', 'demibold', 'demi', 'bold', 'heavy', 'extra bold', 'black'} ============== ======================================================== A dictionary may be used to specify different options for different label values. For example, {1: {"color": "C0", "fontweight": "bold"},3: {"color": "C2",}} would assign the color "C0" and fontweight bold to label value 1 and "C2" to label value 3. The default color is "C1". """ annots = self.get_annotations() if annots is None: return y1 = self.freq_min() y2 = self.freq_max() for idx,annot in annots.iterrows(): l = int(annot['label']) # obs: iterrows does not preserve dtypes across the rows! x1 = annot['start'] x2 = annot['end'] if not np.isnan(annot['freq_min']): y1 = annot['freq_min'] if not np.isnan(annot['freq_max']): y2 = annot['freq_max'] kwargs = {} if annot_kwargs is not None: if isinstance(annot_kwargs, dict) and l in annot_kwargs.keys(): # checking if dict is nested kwargs = annot_kwargs[l] elif isinstance(annot_kwargs, dict): kwargs = annot_kwargs else: raise TypeError("annot_kwargs must be a dict or nested dict.") color = kwargs.get("color", "C1") linewidth = kwargs.get("linewidth", 1) fontsize = kwargs.get("fontsize", None) fontweight = kwargs.get("fontweight", None) box = patches.Rectangle((x1,y1),x2-x1,y2-y1,linewidth=linewidth, edgecolor=color, facecolor='none') ax.add_patch(box) ax.text(x1, y2, int(annot['label']), ha='left', va='bottom', color=color, fontweight=fontweight, fontsize=fontsize)
[docs]class MagSpectrogram(Spectrogram): """ Magnitude Spectrogram. While the underlying data array can be accessed via the :attr:`data` attribute, it is recommended to always use the :func:`get_data` function to access the data array, i.e., >>> from ketos.audio.base_audio import BaseAudio >>> x = np.ones(6) >>> audio_sample = BaseAudio(data=x) >>> audio_sample.get_data() array([1., 1., 1., 1., 1., 1.]) Args: data: numpy array Magnitude spectrogram. time_res: float Time resolution in seconds (corresponds to the bin size used on the time axis) freq_min: float Lower value of the frequency axis in Hz freq_res: float Frequency resolution in Hz (corresponds to the bin size used on the frequency axis) window_func: str Window function used for computing the spectrogram filename: str or list(str) Name of the source audio file, if available. offset: float or array-like Position in seconds of the left edge of the spectrogram within the source audio file, if available. label: int Spectrogram label. Optional annot: AnnotationHandler AnnotationHandler object. Optional transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} transform_log: list(dict) List of transforms that have been applied to this spectrogram waveform_transform_log: list(dict) List of transforms that have been applied to the waveform before generating this spectrogram phase_angle: numpy.array Complex phase angle. Attrs: data: numpy array If the phase angle matrix is not provided, data will be a 2d numpy array containing the magnitude spectrogram. On the other hand, if the phase angle matrix is provided, data will be a 3d numpy array where data[:,:,0] contains the magnitude spectrogram and data[:,:,1] contains the complex phase angle. window_func: str Window function. """ def __init__(self, data, time_res, freq_min, freq_res, window_func=None, filename=None, offset=0, label=None, annot=None, transforms=None, transform_log=None, waveform_transform_log=None, phase_angle=None, **kwargs): # create frequency axis freq_bins = max(1, data.shape[1]) freq_max = freq_min + data.shape[1] * freq_res ax = LinearAxis(bins=freq_bins, extent=(freq_min, freq_max), label='Frequency (Hz)') if phase_angle is not None: assert phase_angle.shape == data.shape, 'phase_angle and data array must have same shape' data = np.stack([data, phase_angle], axis=2) # create spectrogram kwargs.pop('type', None) super().__init__(data=data, time_res=time_res, type=self.__class__.__name__, freq_ax=ax, filename=filename, offset=offset, label=label, annot=annot, transforms=transforms, transform_log=transform_log, waveform_transform_log=waveform_transform_log, **kwargs) self.window_func = window_func
[docs] def get_repres_attrs(self): """ Get audio representation attributes """ attrs = super().get_repres_attrs() attrs.update({'freq_min':self.freq_min(), 'freq_res':self.freq_res(), 'window_func':self.window_func}) return attrs
[docs] def get_kwargs(self): """ Get keyword arguments required to create a copy of this instance. Does not include the data array and annotation handler. """ kwargs = super().get_kwargs() kwargs.pop('freq_ax', None) return kwargs
[docs] def get_data(self): """ Get magnitude spectrogram data """ if np.ndim(self.data) == 3: return self.data[:,:,0] else: return super().get_data()
[docs] def get_phase_angle(self): """ Get magnitude spectrogram complex phase angle, if available """ if np.ndim(self.data) == 3: return self.data[:,:,1] else: return None
[docs] @classmethod def empty(cls): """ Creates an empty MagSpectrogram object """ return cls(data=np.empty(shape=(0,0), dtype=np.float64), time_res=0, freq_min=0, freq_res=0)
[docs] @classmethod def from_waveform(cls, audio, window=None, step=None, seg_args=None, window_func='hamming', freq_min=None, freq_max=None, transforms=None, compute_phase=False, decibel=True, **kwargs): """ Create a Magnitude Spectrogram from an :class:`audio_signal.Waveform` by computing the Short Time Fourier Transform (STFT). Args: audio: Waveform Audio signal window: float Window length in seconds step: float Step size in seconds seg_args: dict Input arguments used for evaluating :func:`audio.audio.segment_args`. Optional. If specified, the arguments `window` and `step` are ignored. window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning freq_min: float Lower frequency in Hz. freq_max: str or float Upper frequency in Hz. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} compute_phase: bool Compute complex phase angle. Default it False decibel: bool Convert to dB scale Returns: spec: MagSpectrogram Magnitude spectrogram """ if window_func is not None: window_func = window_func.lower() #make lowercase # compute STFT img, freq_nyquist, num_fft, seg_args, phase = aum.stft(x=audio.data, rate=audio.rate, window=window, step=step, seg_args=seg_args, window_func=window_func, compute_phase=compute_phase, decibel=decibel) time_res = seg_args['step_len'] / audio.rate freq_res = freq_nyquist / img.shape[1] spec = cls(data=img, time_res=time_res, freq_min=0, freq_res=freq_res, window_func=window_func, filename=audio.filename, offset=audio.offset, label=audio.label, annot=audio.annot, waveform_transform_log=audio.transform_log, transforms=transforms, phase_angle=phase, **kwargs) # Saving decibel option spec.decibel = decibel if freq_min is not None or freq_max is not None: spec = spec.crop(freq_min=freq_min, freq_max=freq_max) return spec
[docs] @classmethod def from_wav(cls, path, window, step, channel=0, rate=None, window_func='hamming', offset=0, duration=None, resample_method='scipy', freq_min=None, freq_max=None, id=None, normalize_wav=False, transforms=None, waveform_transforms=None, compute_phase=False, decibel=True, smooth=0.01, **kwargs): """ Create magnitude spectrogram directly from wav file. The arguments offset and duration can be used to select a portion of the wav file. Note that values specified for the arguments window, step, offset, and duration may all be subject to slight adjustments to ensure that the selected portion corresponds to an integer number of window frames, and that the window and step sizes correspond to an integer number of samples. Args: path: str Path to wav file window: float Window size in seconds step: float Step size in seconds channel: int Channel to read from. Only relevant for stereo recordings rate: float Desired sampling rate in Hz. If None, the original sampling rate will be used window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning offset: float Start time of spectrogram in seconds, relative the start of the wav file. duration: float Length of spectrogram in seconds. resample_method: str Resampling method. Only relevant if `rate` is specified. Options are * kaiser_best * kaiser_fast * scipy (default) * polyphase See https://librosa.github.io/librosa/generated/librosa.core.resample.html for details on the individual methods. freq_min: float Lower frequency in Hz. freq_max: str or float Upper frequency in Hz. id: str Unique identifier (optional). If None, the filename will be used. normalize_wav: bool Normalize the waveform to have a mean of zero (mean=0) and a standard deviation of unity (std=1) before computing the spectrogram. Default is False. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} waveform_transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the waveform before generating the spectrogram. For example, {"name":"add_gaussian_noise", "sigma":0.5} compute_phase: bool Compute complex phase angle. Default it False decibel: bool Convert to dB scale smooth: float Width in seconds of the smoothing region used for stitching together audio files. \**kwargs: additional keyword arguments Keyword arguments to be passed to :meth:`ketos.audio.spectrogram.load_audio_for_spec` and :meth:`ketos.audio.waveform.from_waveform`. Returns: : MagSpectrogram Magnitude spectrogram Example: >>> # load spectrogram from wav file >>> from ketos.audio.spectrogram import MagSpectrogram >>> spec = MagSpectrogram.from_wav('ketos/tests/assets/grunt1.wav', window=0.2, step=0.01) >>> # crop frequency >>> spec = spec.crop(freq_min=50, freq_max=800) >>> # show >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/spec_grunt1.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_grunt1.png """ # load audio audio = load_audio_for_spec(path=path, channel=channel, rate=rate, window=window, step=step, offset=offset, duration=duration, resample_method=resample_method, id=id, normalize_wav=normalize_wav, waveform_transforms=waveform_transforms, smooth=smooth, **kwargs) if audio is None: warnings.warn("Empty spectrogram returned", RuntimeWarning) return cls.empty() # compute spectrogram return cls.from_waveform(audio=audio, seg_args=audio.stft_args, window_func=window_func, freq_min=freq_min, freq_max=freq_max, transforms=transforms, compute_phase=compute_phase, decibel=decibel, **kwargs)
[docs] def freq_res(self): """ Get frequency resolution in Hz. Returns: : float Frequency resolution in Hz """ return self.freq_ax.bin_width()
[docs] def recover_waveform(self, num_iters=25, phase_angle=None, subtract=0): """ Estimate audio signal from magnitude spectrogram. Uses :func:`audio.audio.spec2wave`. Args: num_iters: Number of iterations to perform. phase_angle: Initial condition for phase in radians. If not specified, the phase angle computed computed at initialization will be used, if available. If not available, the phase angle will default to zero and a warning will be printed. Returns: : Waveform Audio signal """ mag = self.get_data() if phase_angle is None: phase_angle = self.get_phase_angle() if phase_angle is None: phase_angle = 0 print('Warning: spectrogram phase angle not available; phase will be set to zero everywhere') # if the frequency axis has been cropped, pad with zeros to ensure that # the spectrogram has the expected shape pad_low = max(0, int(self.freq_min() / self.freq_res())) if pad_low > 0: mag = np.pad(mag, pad_width=((0,0),(pad_low,0)), mode='constant') if np.ndim(phase_angle) == 2: phase_angle = np.pad(phase_angle, pad_width=((0,0),(pad_low,0)), mode='constant') #use linear scale mag = aum.from_decibel(mag) - subtract target_rate = self.freq_ax.bin_width() * 2 * mag.shape[1] # retrieve settings used for computing STFT num_fft = 2 * (mag.shape[1] - 1) step_len = int(target_rate * self.time_res()) #self.seg_args['step_len'] if self.window_func: window_func = get_window(self.window_func, num_fft) else: window_func = np.ones(num_fft) # iteratively estimate audio signal audio = aum.spec2wave(image=mag, phase_angle=phase_angle, num_fft=num_fft,\ step_len=step_len, num_iters=num_iters, window_func=window_func) # sampling rate of recovered audio signal rate = len(audio) / (self.duration() + (num_fft - step_len) / target_rate) # crop at both ends to obtain correct length for waveform num_samples = int(self.duration() * rate) num_cut = int(0.5 * (num_fft - step_len)) audio = audio[num_cut:num_cut+num_samples] return Waveform(rate=rate, data=audio)
[docs] def plot_phase_angle(self, figsize=(5,4), cmap='viridis'): """ Plot the complex phase matrix. Returns None if the complex phase has not been computed. Set compute_phase=True when you initialize the spectrogram to ensure that the phase is computed. Note: The resulting figure can be shown (fig.show()) or saved (fig.savefig(file_name)) Args: figsize: tuple Figure size cmap: string The colormap to be used. The colormaps available can be seen here: https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html Returns: fig: matplotlib.figure.Figure A figure object. """ fig, ax = super(Spectrogram, self).plot(figsize) x = self.get_phase_angle() # select image data if x is None: warnings.warn(f"The complex phase angle has not been computed and can therefore not be plotted. "\ "Make sure to initialize the spectrogram with compute_phase=True to be able to plot the phase.", category=UserWarning) return None extent = (0., self.duration(), self.freq_min(), self.freq_max()) # axes ranges img = ax.imshow(x.T, aspect='auto', origin='lower', cmap=cmap, extent=extent)# draw image ax.set_ylabel(self.freq_ax.label) # axis label fig.colorbar(img, ax=ax)# colobar return fig
[docs]class PowerSpectrogram(Spectrogram): """ Power Spectrogram. Args: data: 2d or 3d numpy array Spectrogram pixel values. time_res: float Time resolution in seconds (corresponds to the bin size used on the time axis) freq_min: float Lower value of the frequency axis in Hz freq_res: float Frequency resolution in Hz (corresponds to the bin size used on the frequency axis) window_func: str Window function used for computing the spectrogram filename: str or list(str) Name of the source audio file, if available. offset: float or array-like Position in seconds of the left edge of the spectrogram within the source audio file, if available. label: int Spectrogram label. Optional annot: AnnotationHandler AnnotationHandler object. Optional transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} transform_log: list(dict) List of transforms that have been applied to this spectrogram waveform_transform_log: list(dict) List of transforms that have been applied to the waveform before generating this spectrogram Attrs: window_func: str Window function. """ def __init__(self, data, time_res, freq_min, freq_res, window_func=None, filename=None, offset=0, label=None, annot=None, transforms=None, transform_log=None, waveform_transform_log=None, **kwargs): # create frequency axis freq_bins = data.shape[1] freq_max = freq_min + freq_bins * freq_res ax = LinearAxis(bins=freq_bins, extent=(freq_min, freq_max), label='Frequency (Hz)') # create spectrogram kwargs.pop('type', None) super().__init__(data=data, time_res=time_res, type=self.__class__.__name__, freq_ax=ax, filename=filename, offset=offset, label=label, annot=annot, transforms=transforms, transform_log=transform_log, waveform_transform_log=waveform_transform_log, **kwargs) self.window_func = window_func
[docs] def get_repres_attrs(self): """ Get audio representation attributes """ attrs = super().get_repres_attrs() attrs.update({'freq_min':self.freq_min(), 'freq_res':self.freq_res(), 'window_func':self.window_func}) return attrs
[docs] def get_kwargs(self): """ Get keyword arguments required to create a copy of this instance. Does not include the data array and annotation handler. """ kwargs = super().get_kwargs() kwargs.pop('freq_ax', None) return kwargs
[docs] @classmethod def empty(cls): """ Creates an empty PowerSpectrogram object """ return cls(data=np.empty(shape=(0,0), dtype=np.float64), time_res=0, freq_min=0, freq_res=0)
[docs] @classmethod def from_waveform(cls, audio, window=None, step=None, seg_args=None, window_func='hamming', freq_min=None, freq_max=None, transforms=None, decibel=True, **kwargs): """ Create a Power Spectrogram from an :class:`audio_signal.Waveform` by computing the Short Time Fourier Transform (STFT). Args: audio: Waveform Audio signal window: float Window length in seconds step: float Step size in seconds seg_args: dict Input arguments used for evaluating :func:`audio.audio.segment_args`. Optional. If specified, the arguments `window` and `step` are ignored. window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning freq_min: float Lower frequency in Hz. freq_max: str or float Upper frequency in Hz. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} decibel: bool Convert to dB scale Returns: : MagSpectrogram Magnitude spectrogram """ if window_func is not None: window_func = window_func.lower() #make lowercase # compute STFT img, freq_nyquist, num_fft, seg_args, phase = aum.stft(x=audio.data, rate=audio.rate, window=window,\ step=step, seg_args=seg_args, window_func=window_func, decibel=False) img = aum.mag2pow(img, num_fft) # Magnitude->Power conversion if decibel: img = aum.to_decibel(img) # convert to dB time_res = seg_args['step_len'] / audio.rate freq_res = freq_nyquist / img.shape[1] spec = cls(data=img, time_res=time_res, freq_min=0, freq_res=freq_res, window_func=window_func, filename=audio.filename, offset=audio.offset, label=audio.label, annot=audio.annot, waveform_transform_log=audio.transform_log, transforms=transforms, **kwargs) # Saving decibel choice spec.decibel = decibel if freq_min is not None or freq_max is not None: spec = spec.crop(freq_min=freq_min, freq_max=freq_max) return spec
[docs] @classmethod def from_wav(cls, path, window, step, channel=0, rate=None, window_func='hamming', offset=0, duration=None, resample_method='scipy', freq_min=None, freq_max=None, id=None, normalize_wav=False, transforms=None, waveform_transforms=None, decibel=True, smooth=0.01, **kwargs): """ Create power spectrogram directly from wav file. The arguments offset and duration can be used to select a portion of the wav file. Note that values specified for the arguments window, step, offset, and duration may all be subject to slight adjustments to ensure that the selected portion corresponds to an integer number of window frames, and that the window and step sizes correspond to an integer number of samples. Args: path: str Path to wav file window: float Window size in seconds step: float Step size in seconds channel: int Channel to read from. Only relevant for stereo recordings rate: float Desired sampling rate in Hz. If None, the original sampling rate will be used window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning offset: float Start time of spectrogram in seconds, relative the start of the wav file. duration: float Length of spectrogrma in seconds. resample_method: str Resampling method. Only relevant if `rate` is specified. Options are * kaiser_best * kaiser_fast * scipy (default) * polyphase See https://librosa.github.io/librosa/generated/librosa.core.resample.html for details on the individual methods. freq_min: float Lower frequency in Hz. freq_max: str or float Upper frequency in Hz. id: str Unique identifier (optional). If None, the filename will be used. normalize_wav: bool Normalize the waveform to have a mean of zero (mean=0) and a standard deviation of unity (std=1) before computing the spectrogram. Default is False. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} waveform_transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the waveform before generating the spectrogram. For example, {"name":"add_gaussian_noise", "sigma":0.5} decibel: bool Convert to dB scale smooth: float Width in seconds of the smoothing region used for stitching together audio files. Returns: spec: MagSpectrogram Magnitude spectrogram Example: >>> # load spectrogram from wav file >>> from ketos.audio.spectrogram import MagSpectrogram >>> spec = MagSpectrogram.from_wav('ketos/tests/assets/grunt1.wav', window=0.2, step=0.01) >>> # crop frequency >>> spec = spec.crop(freq_min=50, freq_max=800) >>> # show >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/spec_grunt1.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/spec_grunt1.png """ # load audio audio = load_audio_for_spec(path=path, channel=channel, rate=rate, window=window, step=step, offset=offset, duration=duration, resample_method=resample_method, id=id, normalize_wav=normalize_wav, waveform_transforms=waveform_transforms) if audio is None: warnings.warn("Empty spectrogram returned", RuntimeWarning) return cls.empty() # compute spectrogram return cls.from_waveform(audio=audio, seg_args=audio.stft_args, window_func=window_func, freq_min=freq_min, freq_max=freq_max, transforms=transforms, decibel=decibel, **kwargs)
[docs] def freq_res(self): """ Get frequency resolution in Hz. Returns: : float Frequency resolution in Hz """ return self.freq_ax.bin_width()
[docs]class MelSpectrogram(Spectrogram): """ Mel Spectrogram. Args: data: 2d numpy array Mel spectrogram pixel values. num_filters: int The number of filters in the filter bank. time_res: float Time resolution in seconds (corresponds to the bin size used on the time axis) freq_max: float Maximum frequency in Hz window_func: str Window function used for computing the spectrogram filename: str or list(str) Name of the source audio file, if available. offset: float or array-like Position in seconds of the left edge of the spectrogram within the source audio file, if available. label: int Spectrogram label. Optional annot: AnnotationHandler AnnotationHandler object. Optional transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} transform_log: list(dict) List of transforms that have been applied to this spectrogram waveform_transform_log: list(dict) List of transforms that have been applied to the waveform before generating this spectrogram Attrs: window_func: str Window function. """ def __init__(self, data, num_filters, time_res, freq_max, start_bin=0, bins=None, window_func=None, filename=None, offset=0, label=None, annot=None, transforms=None, transform_log=None, waveform_transform_log=None, **kwargs): # create frequency axis ax = MelAxis(num_filters=num_filters, freq_max=freq_max, start_bin=start_bin, bins=bins, label='Frequency (Hz)') # create spectrogram kwargs.pop('type', None) super().__init__(data=data, time_res=time_res, type=self.__class__.__name__, freq_ax=ax, filename=filename, offset=offset, label=label, annot=annot, transforms=transforms, transform_log=transform_log, waveform_transform_log=waveform_transform_log, **kwargs) self.window_func = window_func
[docs] def get_repres_attrs(self): """ Get audio representation attributes """ attrs = super().get_repres_attrs() attrs.update({'num_filters':self.freq_ax.num_filters, 'freq_max':self.freq_ax.freq_max, 'start_bin':self.freq_ax.start_bin, 'bins':self.freq_ax.bins, 'window_func':self.window_func}) return attrs
[docs] def get_kwargs(self): """ Get keyword arguments required to create a copy of this instance. Does not include the data array and annotation handler. """ kwargs = super().get_kwargs() kwargs.pop('freq_ax', None) return kwargs
[docs] @classmethod def empty(cls): """ Creates an empty MelSpectrogram object """ return cls(data=np.empty(shape=(0,0), dtype=np.float64), num_filters=40, time_res=1, freq_min=0, freq_max=0)
[docs] @classmethod def from_waveform(cls, audio, window=None, step=None, seg_args=None, window_func='hamming', num_filters=40, transforms=None, **kwargs): """ Creates a Mel Spectrogram from an :class:`audio_signal.Waveform`. Args: audio: Waveform Audio signal window: float Window length in seconds step: float Step size in seconds seg_args: dict Input arguments used for evaluating :func:`audio.audio.segment_args`. Optional. If specified, the arguments `window` and `step` are ignored. window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning num_filters: int The number of filters in the filter bank. Default is 40. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} Returns: : MelSpectrogram Mel spectrogram """ if window_func is not None: window_func = window_func.lower() #make lowercase # compute STFT img, freq_nyquist, num_fft, seg_args, phase = aum.stft(x=audio.data, rate=audio.rate, window=window, step=step, seg_args=seg_args, window_func=window_func, decibel=False) # Magnitude->Mel conversion img = aum.mag2mel(img=img, num_fft=num_fft, rate=audio.rate, num_filters=num_filters) img = np.where(img == 0, np.finfo(float).eps, img) #Numerical Stability img = aum.to_decibel(img) # convert to dB time_res = seg_args['step_len'] / audio.rate return cls(data=img, num_filters=num_filters, time_res=time_res, freq_max=audio.rate/2, window_func=window_func, filename=audio.filename, offset=audio.offset, label=audio.label, annot=audio.annot, waveform_transform_log=audio.transform_log, transforms=transforms, **kwargs)
[docs] @classmethod def from_wav(cls, path, window, step, channel=0, rate=None, window_func='hamming', num_filters=40, offset=0, duration=None, resample_method='scipy', id=None, normalize_wav=False, transforms=None, waveform_transforms=None, smooth=0.01, **kwargs): """ Create Mel spectrogram directly from wav file. The arguments offset and duration can be used to select a portion of the wav file. Note that values specified for the arguments window, step, offset, and duration may all be subject to slight adjustments to ensure that the selected portion corresponds to an integer number of window frames, and that the window and step sizes correspond to an integer number of samples. Args: path: str Path to wav file window: float Window size in seconds step: float Step size in seconds channel: int Channel to read from. Only relevant for stereo recordings rate: float Desired sampling rate in Hz. If None, the original sampling rate will be used window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning num_filters: int The number of filters in the filter bank. Default is 40. offset: float Start time of spectrogram in seconds, relative the start of the wav file. duration: float Length of spectrogrma in seconds. resample_method: str Resampling method. Only relevant if `rate` is specified. Options are * kaiser_best * kaiser_fast * scipy (default) * polyphase See https://librosa.github.io/librosa/generated/librosa.core.resample.html for details on the individual methods. id: str Unique identifier (optional). If None, the filename will be used. normalize_wav: bool Normalize the waveform to have a mean of zero (mean=0) and a standard deviation of unity (std=1) before computing the spectrogram. Default is False. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} waveform_transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the waveform before generating the spectrogram. For example, {"name":"add_gaussian_noise", "sigma":0.5} smooth: float Width in seconds of the smoothing region used for stitching together audio files. Returns: spec: MelSpectrogram Mel spectrogram Example: >>> # load spectrogram from wav file >>> from ketos.audio.spectrogram import MelSpectrogram >>> spec = MelSpectrogram.from_wav('ketos/tests/assets/grunt1.wav', window=0.2, step=0.01) >>> # crop frequency >>> spec = spec.crop(freq_min=50, freq_max=800) >>> # show >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/mel_grunt1.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/mel_grunt1.png """ # load audio audio = load_audio_for_spec(path=path, channel=channel, rate=rate, window=window, step=step, offset=offset, duration=duration, resample_method=resample_method, id=id, normalize_wav=normalize_wav, waveform_transforms=waveform_transforms, smooth=smooth) if audio is None: warnings.warn("Empty spectrogram returned", RuntimeWarning) return cls.empty() # compute spectrogram spec = cls.from_waveform(audio=audio, seg_args=audio.stft_args, window_func=window_func, num_filters=num_filters, transforms=transforms, **kwargs) return spec
[docs] def plot(self, show_annot=False, figsize=(5,4), cmap='viridis', label_in_title=True, vmin=None, vmax=None, num_labels=5): """ Plot the spectrogram with proper axes ranges and labels. The colormaps available can be seen here: https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html Note: The resulting figure can be shown (fig.show()) or saved (fig.savefig(file_name)) TODO: Check implementation for filter_bank=True Args: show_annot: bool Display annotations figsize: tuple Figure size cmap: string The colormap to be used label_in_title: bool Include label (if available) in figure title num_labels: int Number of labels Returns: fig: matplotlib.figure.Figure A figure object. """ fig = super().plot(show_annot, figsize, cmap, label_in_title, vmin, vmax) num = min(self.get_data().shape[1] + 1, num_labels) ticks, labels = self.freq_ax.ticks_and_labels(num_labels=num) plt.yticks(ticks, labels) return fig
[docs]class CQTSpectrogram(Spectrogram): """ Magnitude Spectrogram computed from Constant Q Transform (CQT). Args: image: 2d or 3d numpy array Spectrogram pixel values. time_res: float Time resolution in seconds (corresponds to the bin size used on the time axis) freq_min: float Lower value of the frequency axis in Hz bins_per_oct: int Number of bins per octave window_func: str Window function used for computing the spectrogram filename: str or list(str) Name of the source audio file, if available. offset: float or array-like Position in seconds of the left edge of the spectrogram within the source audio file, if available. label: int Spectrogram label. Optional annot: AnnotationHandler AnnotationHandler object. Optional transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} transform_log: list(dict) List of transforms that have been applied to this spectrogram waveform_transform_log: list(dict) List of transforms that have been applied to the waveform before generating this spectrogram Attrs: window_func: str Window function. """ def __init__(self, data, time_res, bins_per_oct, freq_min, window_func=None, filename=None, offset=0, label=None, annot=None, transforms=None, transform_log=None, waveform_transform_log=None, **kwargs): # create logarithmic frequency axis ax = Log2Axis(bins=data.shape[1], bins_per_oct=bins_per_oct,\ min_value=freq_min, label='Frequency (Hz)') # create spectrogram kwargs.pop('type', None) super().__init__(data=data, time_res=time_res, type=self.__class__.__name__, freq_ax=ax, filename=filename, offset=offset, label=label, annot=annot, transforms=transforms, transform_log=transform_log, waveform_transform_log=waveform_transform_log, **kwargs) self.window_func = window_func
[docs] def get_repres_attrs(self): """ Get audio representation attributes """ attrs = super().get_repres_attrs() attrs.update({'freq_min':self.freq_min(), 'bins_per_oct':self.bins_per_octave(), 'window_func':self.window_func}) return attrs
[docs] def get_kwargs(self): """ Get keyword arguments required to create a copy of this instance. Does not include the data array and annotation handler. """ kwargs = super().get_kwargs() kwargs.pop('freq_ax', None) return kwargs
[docs] @classmethod def empty(cls): """ Creates an empty CQTSpectrogram object """ return cls(data=np.empty(shape=(0,0), dtype=np.float64), time_res=0, bins_per_oct=0, freq_min=0)
[docs] @classmethod def from_waveform(cls, audio, step, bins_per_oct, freq_min=1, freq_max=None, window_func='hann', transforms=None, **kwargs): """ Magnitude Spectrogram computed from Constant Q Transform (CQT) using the librosa implementation: https://librosa.github.io/librosa/generated/librosa.core.cqt.html The frequency axis of a CQT spectrogram is essentially a logarithmic axis with base 2. It is characterized by an integer number of bins per octave (an octave being a doubling of the frequency.) For further details, see :func:`audio.audio.cqt`. Args: audio: Waveform Audio signal step: float Step size in seconds bins_per_oct: int Number of bins per octave freq_min: float Minimum frequency in Hz. Default is 1 Hz. freq_max: float Maximum frequency in Hz If None, it is set half the sampling rate. window_func: str Window function (optional). Select between * bartlett * blackman * hamming * hanning (default) transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} Returns: spec: CQTSpectrogram CQT spectrogram """ if window_func is not None: window_func = window_func.lower() #make lowercase # compute CQT img, step = aum.cqt(x=audio.data, rate=audio.rate, step=step, bins_per_oct=bins_per_oct, freq_min=freq_min, freq_max=freq_max, window_func=window_func) spec = cls(data=img, time_res=step, freq_min=freq_min, bins_per_oct=bins_per_oct, window_func=window_func, filename=audio.filename, offset=audio.offset, label=audio.label, annot=audio.annot, waveform_transform_log=audio.transform_log, transforms=transforms, **kwargs) if freq_min is not None or freq_max is not None: spec = spec.crop(freq_min=freq_min, freq_max=freq_max) return spec
[docs] @classmethod def from_wav(cls, path, step, bins_per_oct, freq_min=1, freq_max=None, channel=0, rate=None, window_func='hann', offset=0, duration=None, resample_method='scipy', id=None, normalize_wav=False, transforms=None, waveform_transforms=None, smooth=0.01, **kwargs): """ Create CQT spectrogram directly from wav file. The arguments offset and duration can be used to select a segment of the audio file. Note that values specified for the arguments window, step, offset, and duration may all be subject to slight adjustments to ensure that the selected portion corresponds to an integer number of window frames, and that the window and step sizes correspond to an integer number of samples. Args: path: str Complete path to wav file step: float Step size in seconds bins_per_oct: int Number of bins per octave freq_min: float Minimum frequency in Hz. Default is 1 Hz. freq_max: float Maximum frequency in Hz If None, it is set half the sampling rate. channel: int Channel to read from. Only relevant for stereo recordings rate: float Desired sampling rate in Hz. If None, the original sampling rate will be used window_func: str Window function (optional). Select between * bartlett * blackman * hamming (default) * hanning offset: float Start time of spectrogram in seconds, relative the start of the wav file. duration: float Length of spectrogrma in seconds. resample_method: str Resampling method. Only relevant if `rate` is specified. Options are * kaiser_best * kaiser_fast * scipy (default) * polyphase See https://librosa.github.io/librosa/generated/librosa.core.resample.html for details on the individual methods. id: str Unique identifier (optional). If None, the filename will be used. normalize_wav: bool Normalize the waveform to have a mean of zero (mean=0) and a standard deviation of unity (std=1) before computing the spectrogram. Default is False. transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the spectrogram. For example, {"name":"normalize", "mean":0.5, "std":1.0} waveform_transforms: list(dict) List of dictionaries, where each dictionary specifies the name of a transformation to be applied to the waveform before generating the spectrogram. For example, {"name":"add_gaussian_noise", "sigma":0.5} smooth: float Width in seconds of the smoothing region used for stitching together audio files. Returns: : CQTSpectrogram CQT spectrogram Example: >>> # load spectrogram from wav file >>> from ketos.audio.spectrogram import CQTSpectrogram >>> spec = CQTSpectrogram.from_wav('ketos/tests/assets/grunt1.wav', step=0.01, freq_min=10, freq_max=800, bins_per_oct=16) >>> # show >>> fig = spec.plot() >>> fig.savefig("ketos/tests/assets/tmp/cqt_grunt1.png") >>> plt.close(fig) .. image:: ../../../ketos/tests/assets/tmp/cqt_grunt1.png """ # load audio audio = Waveform.from_wav(path=path, rate=rate, channel=channel, offset=offset, duration=duration, resample_method=resample_method, id=id, normalize_wav=normalize_wav, transforms=waveform_transforms, smooth=smooth) if len(audio.get_data()) == 0: warnings.warn("Empty spectrogram returned", RuntimeWarning) return cls.empty() # create CQT spectrogram return cls.from_waveform(audio=audio, step=step, bins_per_oct=bins_per_oct, freq_min=freq_min, freq_max=freq_max, window_func=window_func, transforms=transforms, **kwargs)
[docs] def bins_per_octave(self): """ Get no. bins per octave. Returns: : int No. bins per octave. """ return self.freq_ax.bins_per_oct
[docs] def plot(self, show_annot=False, figsize=(5,4), cmap='viridis', label_in_title=True, vmin=None, vmax=None): """ Plot the spectrogram with proper axes ranges and labels. Optionally, also display annotations as boxes superimposed on the spectrogram. The colormaps available can be seen here: https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html Note: The resulting figure can be shown (fig.show()) or saved (fig.savefig(file_name)) Args: show_annot: bool Display annotations figsize: tuple Figure size cmap: string The colormap to be used label_in_title: bool Include label (if available) in figure title Returns: fig: matplotlib.figure.Figure A figure object. """ fig = super().plot(show_annot, figsize, cmap, label_in_title, vmin, vmax) ticks, labels = self.freq_ax.ticks_and_labels() plt.yticks(ticks, labels) return fig