Source code for ketos.audio.base_audio

# ================================================================================ #
#   Authors: Fabio Frazao and Oliver Kirsebom                                      #
#   Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca                               #
#   Organization: MERIDIAN (https://meridian.cs.dal.ca/)                           #
#   Team: Data Analytics                                                           #
#   Project: ketos                                                                 #
#   Project goal: The ketos library provides functionalities for handling          #
#   and processing acoustic data and applying deep neural networks to sound        #
#   detection and classification tasks.                                            #
#                                                                                  #
#   License: GNU GPLv3                                                             #
#                                                                                  #
#       This program is free software: you can redistribute it and/or modify       #
#       it under the terms of the GNU General Public License as published by       #
#       the Free Software Foundation, either version 3 of the License, or          #
#       (at your option) any later version.                                        #
#                                                                                  #
#       This program is distributed in the hope that it will be useful,            #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#       GNU General Public License for more details.                               # 
#                                                                                  #
#       You should have received a copy of the GNU General Public License          #
#       along with this program.  If not, see <https://www.gnu.org/licenses/>.     #
# ================================================================================ #

""" 'audio.base_audio' module within the ketos library

    This module contains the base class for the Waveform and Spectrogram classes.

    Contents:
        BaseAudio class;
        BaseAudioTime class
"""
import os
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ketos.audio.utils.misc as aum
from ketos.audio.annotation import AnnotationHandler, stack_annotations
from ketos.audio.utils.axis import LinearAxis


def segment_data(x, window, step=None):
    """ Divide the time axis into segments of uniform length, which may or may
        not be overlapping.

        Window length and step size are converted to the nearest integer number
        of time steps.

        If necessary, the data array will be padded with zeros at the end to
        ensure that all segments have an equal number of samples.

        Each returned segment is a new instance of the same class as `x`,
        created from `x.get_kwargs()` with its own `offset`, its own share of
        the annotations (if any) and its own independent copy of the
        transform log.

        Args:
            x: BaseAudioTime
                Data to be segmented
            window: float
                Length of each segment in seconds.
            step: float
                Step size in seconds. If None, the step size is set equal to
                the window length.

        Returns:
            audio_objects: list(BaseAudioTime)
                Data segments
    """
    if step is None:
        step = window

    time_res = x.time_res()
    win_len = aum.num_samples(window, 1. / time_res)
    step_len = aum.num_samples(step, 1. / time_res)

    # segment data array
    segs = aum.segment(x=x.data, win_len=win_len, step_len=step_len, pad_mode='zero')

    # window length and step size expressed as whole numbers of time bins,
    # converted back to seconds
    window = win_len * time_res
    step = step_len * time_res
    num_segs = segs.shape[0]

    # segment annotations
    annots = None
    if x.annot is not None:
        annots = x.annot.segment(num_segs=num_segs, window=window, step=step)

    # offset of every segment, including the global offset of the input object.
    # An out-of-place sum is used on purpose: an in-place `+=` fails with a
    # casting error when the offsets array is integer and x.offset is a float.
    offsets = x.offset + np.arange(num_segs) * step

    # create audio objects
    audio_objects = []
    for i in range(num_segs):
        annot = annots.get(id=i) if annots is not None else None

        kwargs = x.get_kwargs()
        kwargs.pop('offset', None)  # replaced by the per-segment offset below

        # give every segment its own transform log; otherwise all segments
        # would share (and mutate) the list owned by the input object
        if 'transform_log' in kwargs:
            kwargs['transform_log'] = copy.deepcopy(kwargs['transform_log'])

        audio_objects.append(x.__class__(data=segs[i], annot=annot, offset=offsets[i], **kwargs))

    return audio_objects
class BaseAudio():
    """ Parent class for all audio classes.

        While the underlying data array can be accessed via the :attr:`data`
        attribute, it is recommended to always use the :func:`get_data`
        function to access the data array, i.e.,

        >>> import numpy as np
        >>> from ketos.audio.base_audio import BaseAudio
        >>> x = np.ones(6)
        >>> audio_sample = BaseAudio(data=x)
        >>> audio_sample.get_data()
        array([1., 1., 1., 1., 1., 1.])

        Args:
            data: numpy array
                Data
            filename: str
                Filename of the original data file, if available (optional)
            offset: float
                Position within the original data file, in seconds
                measured from the start of the file. Defaults to 0 if not specified.
            duration: float
                Duration in seconds.
            label: int
                Data label. Optional
            annot: AnnotationHandler or pandas DataFrame
                Annotations. A DataFrame is wrapped in an AnnotationHandler. Optional
            transforms: list(dict)
                List of dictionaries, where each dictionary specifies the name of
                a transformation and its arguments, if any. For example,
                {"name":"normalize", "mean":0.5, "std":1.0}
            transform_log: list(dict)
                Record of transforms already applied to the data. The list is
                copied, so the caller's list is never modified. Optional
            kwargs:
                Any additional keyword arguments are stored and returned
                by :func:`get_kwargs`.

        Attributes:
            data: numpy array
                Data
            ndim: int
                Dimensionality of data.
            filename: str
                Filename of the original data file, if available (optional)
            offset: float
                Position within the original data file, in seconds
                measured from the start of the file. Defaults to 0 if not specified.
            label: int
                Data label.
            annot: AnnotationHandler
                AnnotationHandler object, or None.
            allowed_transforms: dict
                Transforms that can be applied via the apply_transforms method
            transform_log: list
                List of transforms that have been applied to this object
    """
    def __init__(self, data, filename='', offset=0, duration=None, label=None, annot=None,
                 transforms=None, transform_log=None, **kwargs):

        # Keep a private copy of the log. get() and get_kwargs() hand the log of
        # an existing instance to the constructor of a new one; without the copy
        # both instances would append to the very same list.
        if transform_log is None:
            transform_log = []
        else:
            transform_log = copy.deepcopy(list(transform_log))

        if isinstance(annot, pd.DataFrame):
            annot = AnnotationHandler(annot)

        self.ndim = np.ndim(data)
        self.data = data
        self.filename = filename
        self.offset = offset
        self._duration = duration
        self.label = label
        self.annot = annot
        self.allowed_transforms = {'normalize': self.normalize,
                                   'adjust_range': self.adjust_range}
        self.transform_log = transform_log
        self.apply_transforms(transforms)
        self.kwargs = kwargs

    @staticmethod
    def infer_shape(**kwargs):
        """ Infers the data shape that would result if the class were
            instantiated with a specific set of parameter values.

            Returns a None value if `duration` or `rate` are not specified.

            Args:
                duration: float
                    Duration in seconds
                rate: float
                    Sampling rate in Hz

            Returns:
                : tuple
                    Inferred shape. If the parameter values do not allow
                    the shape to be inferred, a None value is returned.
        """
        if 'duration' in kwargs and 'rate' in kwargs:
            num_samples = int(kwargs['duration'] * kwargs['rate'])
            return (num_samples,)

        return None

    def get(self):
        """ Get a copy of this instance.

            A new instance of the same class is created from the output of
            :func:`get_data`, :func:`get_annotations` and :func:`get_kwargs`.
            Note that the data array itself is passed on as is, i.e., it is
            not duplicated. Use :func:`deepcopy` to obtain a fully
            independent object.
        """
        return self.__class__(data=self.get_data(), annot=self.get_annotations(), **self.get_kwargs())

    def get_kwargs(self):
        """ Get keyword arguments required to create a copy of this instance.

            Does not include the data array and annotation handler.
        """
        kwargs = {}
        kwargs.update(self.get_repres_attrs())
        kwargs.update(self.get_instance_attrs())
        return kwargs

    def get_repres_attrs(self):
        """ Get audio representation attributes """
        attrs = {'transform_log': self.transform_log}
        return attrs

    def get_instance_attrs(self):
        """ Get instance attributes, including any extra keyword
            arguments that were passed to the constructor.
        """
        attrs = {'filename': self.filename, 'offset': self.offset,
                 'duration': self._duration, 'label': self.label}
        attrs.update(self.kwargs)
        return attrs

    def get_data(self):
        """ Get underlying data.

            Returns:
                : numpy array
                    Data array
        """
        return self.data

    def get_filename(self):
        """ Get filename.

            Returns:
                : string
                    Filename
        """
        return self.filename

    def get_offset(self):
        """ Get offset.

            Returns:
                : float
                    Offset
        """
        return self.offset

    def duration(self):
        """ Data array duration in seconds

            TODO: rename to get_duration()

            Returns:
                : float
                   Duration in seconds
        """
        return self._duration

    def get_label(self, id=None):
        """ Get label.

            Args:
                id: None
                    Not used by this implementation.

            Returns:
                : int
                    Label
        """
        return self.label

    def get_annotations(self):
        """ Get annotations.

            Returns:
                : pandas DataFrame
                    Annotations, or None if the object has no
                    annotation handler.
        """
        if self.annot is None:
            return None

        return self.annot.get()

    def deepcopy(self):
        """ Make a deep copy of the present instance

            See https://docs.python.org/3/library/copy.html

            Returns:
                : BaseAudio
                    Deep copy.
        """
        return copy.deepcopy(self)

    def max(self, axis=0):
        """ Maximum data value along selected axis

            Args:
                axis: int
                    Axis along which metric is computed

            Returns:
                : array-like
                    Maximum value of the data array
        """
        return np.max(self.data, axis=axis)

    def min(self, axis=0):
        """ Minimum data value along selected axis

            Args:
                axis: int
                    Axis along which metric is computed

            Returns:
                : array-like
                    Minimum value of the data array
        """
        return np.min(self.data, axis=axis)

    def std(self, axis=0):
        """ Standard deviation along selected axis

            Args:
                axis: int
                    Axis along which metric is computed

            Returns:
                : array-like
                    Standard deviation of the data array
        """
        return np.std(self.data, axis=axis)

    def average(self, axis=0):
        """ Average value along selected axis

            Args:
                axis: int
                    Axis along which metric is computed

            Returns:
                : array-like
                    Average value of the data array
        """
        return np.average(self.data, axis=axis)

    def median(self, axis=0):
        """ Median value along selected axis

            Args:
                axis: int
                    Axis along which metric is computed

            Returns:
                : array-like
                    Median value of the data array
        """
        return np.median(self.data, axis=axis)

    def normalize(self, mean=0, std=1):
        """ Normalize the data array to specified mean and standard deviation.

            For the data array to be normalizable, it must have non-zero standard
            deviation. If this is not the case, the array is unchanged by calling
            this method.

            The transform is added to the transform log in either case.

            Args:
                mean: float
                    Mean value of the normalized array. The default is 0.
                std: float
                    Standard deviation of the normalized array. The default is 1.
        """
        std_orig = np.std(self.data)
        if std_orig > 0:
            self.data = std * (self.data - np.mean(self.data)) / std_orig + mean

        self.transform_log.append({'name': 'normalize', 'mean': mean, 'std': std})

    def adjust_range(self, range=(0,1)):
        """ Applies a linear transformation to the data array that puts the values
            within the specified range.

            The minimum and maximum are obtained from :func:`min` and :func:`max`
            with their default axis (0). Where the maximum equals the minimum,
            the values are mapped to the lower bound of the range rather
            than being divided by zero.

            Args:
                range: tuple(float,float)
                    Minimum and maximum value of the desired range. Default is (0,1)
        """
        lower, upper = range[0], range[1]
        x_min = self.min()
        x_max = self.max()

        # replace a vanishing span by 1 to avoid 0/0 = NaN; the numerator is
        # zero for those entries, so they end up at the lower bound
        span = np.asarray(x_max - x_min, dtype=float)
        safe_span = np.where(span == 0, 1.0, span)

        self.data = (upper - lower) * (self.data - x_min) / safe_span + lower
        self.transform_log.append({'name': 'adjust_range', 'range': range})

    def view_allowed_transforms(self):
        """ View allowed transformations for this audio object.

            Returns:
                : list
                    List of allowed transformations
        """
        return list(self.allowed_transforms.keys())

    def apply_transforms(self, transforms):
        """ Apply specified transforms to the audio object.

            Transforms are applied in the order given. Entries whose name is
            not among the allowed transforms are skipped. The input list is
            not modified.

            Args:
                transforms: list(dict)
                    List of dictionaries, where each dictionary specifies the name of
                    a transformation and its arguments, if any. For example,
                    {"name":"normalize", "mean":0.5, "std":1.0}

            Returns:
                None

            Example:
                >>> from ketos.audio.waveform import Waveform
                >>> # read audio signal from wav file
                >>> wf = Waveform.from_wav('ketos/tests/assets/grunt1.wav')
                >>> # print allowed transforms
                >>> wf.view_allowed_transforms()
                ['normalize', 'adjust_range', 'crop', 'add_gaussian_noise', 'bandpass_filter']
                >>> # apply gaussian normalization followed by cropping
                >>> transforms = [{'name':'normalize','mean':0.5,'std':1.0},{'name':'crop','start':0.2,'end':0.7}]
                >>> wf.apply_transforms(transforms)
                >>> # inspect record of applied transforms
                >>> wf.transform_log
                [{'name': 'normalize', 'mean': 0.5, 'std': 1.0}, {'name': 'crop', 'start': 0.2, 'end': 0.7, 'length': None}]
        """
        if transforms is None:
            return

        # work on a copy since the 'name' entry is popped from each dictionary
        for spec in copy.deepcopy(transforms):
            name = spec.pop('name')
            if name in self.allowed_transforms:
                self.allowed_transforms[name](**spec)

    def annotate(self, **kwargs):
        """ Add an annotation or a collection of annotations.

            Input arguments are described in :meth:`ketos.audio.annotation.AnnotationHandler.add`
        """
        if self.annot is None:
            # if the object does not have an annotation handler, create one!
            self.annot = AnnotationHandler()

        self.annot.add(**kwargs)
class BaseAudioTime(BaseAudio):
    """ Parent class for time-series audio classes such as
        :class:`audio.waveform.Waveform` and
        :class:`audio.spectrogram.Spectrogram`.

        Args:
            data: numpy array
                Data
            time_res: float
                Time resolution in seconds
            filename: str
                Filename of the original data file, if available (optional)
            offset: float
                Position within the original data file, in seconds
                measured from the start of the file. Defaults to 0 if not specified.
            label: int
                Data label. Optional
            annot: AnnotationHandler
                AnnotationHandler object. Optional
            transforms: list(dict)
                List of dictionaries, where each dictionary specifies the name of
                a transformation and its arguments, if any. For example,
                {"name":"normalize", "mean":0.5, "std":1.0}
            transform_log: list(dict)
                Record of transforms already applied to the data. Optional

        Attributes:
            data: numpy array
                Data
            ndim: int
                Dimensionality of data.
            time_ax: LinearAxis
                Axis object for the time dimension
            filename: str
                Filename of the original data file, if available (optional)
            offset: float
                Position within the original data file, in seconds
                measured from the start of the file. Defaults to 0 if not specified.
            label: int
                Data label.
            annot: AnnotationHandler or pandas DataFrame
                AnnotationHandler object.
            allowed_transforms: dict
                Transforms that can be applied via the apply_transforms method
            transform_log: list
                List of transforms that have been applied to this object
    """
    def __init__(self, data, time_res, filename='', offset=0, label=None, annot=None,
                 transforms=None, transform_log=None, **kwargs):

        # initialize time axis; this must happen before the parent constructor
        # is called because duration() reads the axis
        bins = max(1, data.shape[0])
        length = data.shape[0] * time_res
        self.time_ax = LinearAxis(bins=bins, extent=(0., length), label='Time (s)')

        # The transforms are deliberately withheld from the parent constructor:
        # it would apply them before 'crop' has been registered, so that a crop
        # requested at construction time would be silently skipped.
        super().__init__(data=data, filename=filename, offset=offset, duration=self.duration(),
                         label=label, annot=annot, transforms=None, transform_log=transform_log,
                         **kwargs)

        self.allowed_transforms.update({'crop': self.crop})
        self.apply_transforms(transforms)

    def get_repres_attrs(self):
        """ Get audio representation attributes, including the time resolution """
        attrs = super().get_repres_attrs()
        attrs.update({'time_res': self.time_res()})
        return attrs

    def get_instance_attrs(self):
        """ Get instance attributes.

            The `duration` entry is removed, since the constructor of this
            class derives the duration from the data and the time resolution.
        """
        attrs = super().get_instance_attrs()
        attrs.pop('duration', None)
        return attrs

    def time_res(self):
        """ Get the time resolution.

            Returns:
                : float
                    Time resolution in seconds
        """
        return self.time_ax.bin_width()

    def duration(self):
        """ Data array duration in seconds

            Returns:
                : float
                   Duration in seconds
        """
        return self.time_ax.max()

    def label_array(self, label):
        """ Get an array indicating presence/absence (1/0)
            of the specified annotation label for each time bin.

            Args:
                label: int
                    Label of interest.

            Returns:
                y: numpy.array
                    Label array
        """
        assert self.annot is not None, "An AnnotationHandler object is required for computing the label vector"

        y = np.zeros(self.time_ax.bins)
        ans = self.annot.get(label=label)
        for _, an in ans.iterrows():
            # first and last time bin touched by the annotation
            b1 = self.time_ax.bin(an.start, truncate=True)
            b2 = self.time_ax.bin(an.end, truncate=True, closed_right=True)
            y[b1:b2+1] = 1

        return y

    def segment(self, window, step=None):
        """ Divide the time axis into segments of uniform length, which may
            or may not be overlapping.

            Window length and step size are converted to the nearest integer
            number of time steps.

            If necessary, the data array will be padded with zeros at the end to
            ensure that all segments have an equal number of samples.

            Args:
                window: float
                    Length of each segment in seconds.
                step: float
                    Step size in seconds.

            Returns:
                : list(BaseAudioTime)
                    Data segments
        """
        return segment_data(self, window, step)

    def crop(self, start=None, end=None, length=None, make_copy=False):
        """ Crop audio signal.

            The time axis, the data array and the annotations (if any) are
            cropped, the offset is increased by the start time of the cropped
            axis, and the time axis is shifted to start at zero.

            Args:
                start: float
                    Start time in seconds, measured from the start of the time axis.
                end: float
                    End time in seconds, measured from the start of the time axis.
                length: int
                    Size of the cropped data along the time axis (number of bins).
                    If provided, the `end` argument is ignored.
                make_copy: bool
                    Return a cropped copy of the object. Leaves the present
                    instance unaffected. Default is False.

            Returns:
                d: BaseAudioTime
                    The cropped object, i.e., the present instance or, if
                    `make_copy` is True, a cropped deep copy of it.
        """
        d = self.deepcopy() if make_copy else self

        # crop axis
        b1, b2 = d.time_ax.cut(x_min=start, x_max=end, bins=length)

        # crop audio signal
        d.data = d.data[b1:b2+1]

        # crop annotations, if any
        # NOTE(review): this is a truthiness test, not an `is not None` test;
        # confirm how AnnotationHandler evaluates in a boolean context.
        if d.annot:
            d.annot.crop(start=start, end=end)

        d.offset += d.time_ax.low_edge(0)  # update time offset
        d.time_ax.zero_offset()  # shift time axis to start at t=0

        # NOTE(review): the crop is only logged when cropping in place; the
        # log of a cropped copy does not mention the crop. Confirm intent.
        if make_copy is False:
            self.transform_log.append({'name': 'crop', 'start': start, 'end': end, 'length': length})

        return d

    def plot(self, figsize=(5,4), label_in_title=True, append_title=''):
        """ Create a figure with the proper time axis label and title.

            This method only prepares the canvas: it creates the figure and
            axes, labels the time axis, composes the title from the filename,
            the label and `append_title`, and, if the offset is non-zero, adds
            a second time axis at the top showing the absolute time. It does
            not draw the data itself.

            Note: The resulting figure can be shown (fig.show())
            or saved (fig.savefig(file_name))

            Args:
                figsize: tuple
                    Figure size
                label_in_title: bool
                    Include label (if available) in figure title
                append_title: str
                    Append this string to the title

            Returns:
                fig: matplotlib.figure.Figure
                    A figure object.
                ax: matplotlib.axes.Axes
                    Axes object
        """
        # create canvas and axes
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize, sharex=True)

        # select the attributes
        filename = self.get_filename()
        offset = self.get_offset()
        label = self.get_label()

        # axis labels
        ax.set_xlabel(self.time_ax.label)

        # title
        title = ""
        if filename is not None:
            title += "{0}".format(filename)

        if label is not None and label_in_title:
            if len(title) > 0:
                title += ", "
            title += "{0}".format(label)

        title += append_title
        ax.set_title(title)

        # if offset is non-zero, add a second time axis at the top
        # showing the `absolute` time
        if offset != 0:
            axt = ax.twiny()
            axt.set_xlim(offset, offset + self.duration())

        return fig, ax