# Source code for ketos.audio.audio_loader

# ================================================================================ #
#   Authors: Fabio Frazao and Oliver Kirsebom                                      #
#   Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca                               #
#   Organization: MERIDIAN (https://meridian.cs.dal.ca/)                           #
#   Team: Data Analytics                                                           #
#   Project: ketos                                                                 #
#   Project goal: The ketos library provides functionalities for handling          #
#   and processing acoustic data and applying deep neural networks to sound        #
#   detection and classification tasks.                                            #
#                                                                                  #
#   License: GNU GPLv3                                                             #
#                                                                                  #
#       This program is free software: you can redistribute it and/or modify       #
#       it under the terms of the GNU General Public License as published by       #
#       the Free Software Foundation, either version 3 of the License, or          #
#       (at your option) any later version.                                        #
#                                                                                  #
#       This program is distributed in the hope that it will be useful,            #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#       GNU General Public License for more details.                               # 
#                                                                                  #
#       You should have received a copy of the GNU General Public License          #
#       along with this program.  If not, see <https://www.gnu.org/licenses/>.     #
# ================================================================================ #

""" 'audio.audio_loader' module within the ketos library

    This module contains the utilities for loading waveforms and computing spectrograms.

    The audio representations currently implemented in Ketos are: 

    - :class:`Waveform <ketos.audio.waveform.Waveform>`
    - :class:`magnitude spectrogram <ketos.audio.spectrogram.MagSpectrogram>`
    - :class:`power spectrogram <ketos.audio.spectrogram.PowSpectrogram>`
    - :class:`mel spectrogram <ketos.audio.spectrogram.MelSpectrogram>`
    - :class:`CQT spectrogram <ketos.audio.spectrogram.CQTSpectrogram>`
    - :class:`GammatoneFilterBank <ketos.audio.gammatone.GammatoneFilterBank>`
    - :class:`AuralFeatures <ketos.audio.gammatone.AuralFeatures>`
"""
import os
from pathlib import Path
import pandas as pd
import numpy as np
import soundfile as sf
import tarfile
import warnings
import shutil
from ketos.audio.waveform import Waveform, get_duration
from ketos.data_handling.data_handling import find_wave_files
from ketos.data_handling.selection_table import query
from ketos.utils import floor_round_up, ceil_round_down, user_format_warning


class ArchiveManager():
    ''' Manages the extraction of files from a .tar archive.

        Use the method :meth:`ketos.audio.audio_loader.ArchiveManager.extract` to request that 
        one or several files from the .tar file be made available in a temporary directory.

        Whenever such a request is submitted, the contents of the temporary directory are 
        synchronized with the request: 

         * Requested files *not* already present in the directory are extracted.
         * Requested files already present in the directory are left untouched.
         * Files present in the directory that are not part of the request are removed.

        The location of the temporary directory and the relative paths of the files currently 
        stored within it are available via the attributes @extract_dir and @extracted_files.

        Args:
            tar_path: str
                Path to the .tar file
            extract_dir: str
                Path to the directory where the extracted files are temporarily stored. The directory 
                is automatically created. If a directory already exists at the specified path, all its 
                contents will be deleted. By default, audio files are extracted to the folder `kt-tmp` 
                within the current working directory.

        Attributes:
            tar: TarFile
                tar object
            tar_path: str
                Path to the .tar file
            extract_dir: str
                Path to the directory where the extracted files are temporarily stored
            extracted_files: list
                Relative paths to the currently extracted files
    '''
    def __init__(self, tar_path, extract_dir="kt-tmp"):
        self.tar_path = tar_path
        self.tar = tarfile.open(tar_path)
        self.extract_dir = extract_dir
        self.extracted_files = []
        # start from a clean slate: wipe any pre-existing extraction directory
        self.close()

    def _extract_files(self, paths):
        """ Extract the specified files from the archive into the extraction directory.

            Issues a UserWarning for any path that does not exist within the tar archive.

            Args:
                paths: list
                    Relative paths of the files to be extracted from within the tar archive
        """
        for rel_path in paths:
            try:
                self.tar.extract(member=rel_path, path=self.extract_dir)
            except KeyError:
                # member not present in the archive: warn, but keep going
                warnings.formatwarning = user_format_warning
                warnings.warn(f"{rel_path} not found in {self.tar_path}")
            else:
                # only record the file if extraction succeeded
                self.extracted_files.append(rel_path)

    def _remove_files(self, paths):
        """ Delete the specified files from the extraction directory.

            Args:
                paths: list
                    Relative paths of the files to be removed
        """
        for rel_path in paths:
            os.remove(os.path.join(self.extract_dir, rel_path))
            self.extracted_files.remove(rel_path)

    def extract(self, paths):
        """ Synchronize the extraction directory with the requested set of files.

            Every time this method is called, the contents of the temporary directory 
            are updated as follows: 

                * Requested files *not* already present in the directory are extracted.
                * Requested files already present in the directory are left untouched.
                * Files present in the directory that are not part of the request are removed.

            Args:
                paths: str or list
                    Relative path(s) of the files within the tar archive that we want to 
                    be available in the extraction directory
        """
        if isinstance(paths, str):
            paths = [paths]

        missing = [p for p in paths if p not in self.extracted_files]
        obsolete = [p for p in self.extracted_files if p not in paths]
        self._extract_files(missing)
        self._remove_files(obsolete)

    def close(self):
        """ Remove the extraction directory and all its contents. """
        if os.path.exists(self.extract_dir):
            shutil.rmtree(self.extract_dir)

class SelectionGenerator():
    """ Template class for selection generators. 

        Child classes must implement :meth:`__next__` and :meth:`num`, 
        and may override :meth:`reset`.
    """
    def __iter__(self):
        # the generator acts as its own iterator
        return self

    def __next__(self):
        """ Returns the next audio selection.

            An audio selection is represented as a dictionary with two required keys 
            (data_dir, filename) and an unlimited number of optional keys, which 
            typically include offset, duration, and label.

            Must be implemented in child class.

            Returns:
                : dict()
                    Next audio selection
        """
        pass

    def num(self):
        """ Returns total number of selections.

            Must be implemented in child class.

            Returns:
                : int
                    Total number of selections.
        """
        pass

    def reset(self):
        """ Resets the selection generator to the beginning. """
        pass
class SelectionTableIterator(SelectionGenerator):
    """ Iterates over entries in a selection table.

        Args:
            data_dir: str
                Path to top folder containing audio files, or a .tar archive file.
            selection_table: pandas DataFrame
                Selection table
            include_attrs: bool
                If True, load data from all attribute columns in the selection table. Default is False.
            attrs: list(str)
                Specify the names of the attribute columns that you wish to load data from. 
                Overwrites include_attrs if specified. If None, all columns will be loaded 
                provided that include_attrs=True.
            extract_dir: str
                Temporary directory for storing audio files extracted from a tar archive file. 
                Only relevant if @data_dir points to a .tar file. The directory will be 
                automatically created. If a directory already exists at the specified path, all 
                its contents will be deleted. By default, audio files are extracted to the folder 
                `kt-tmp` within the current working directory. Note that this folder must be 
                deleted manually when it is no longer needed. 
    """
    def __init__(self, data_dir, selection_table, include_attrs=False, attrs=None, extract_dir="kt-tmp"):
        self.sel = selection_table

        # if data_dir is a tar archive, audio files will be extracted on demand 
        # to a temporary directory
        if os.path.isfile(data_dir) and tarfile.is_tarfile(data_dir):
            self.tar = ArchiveManager(data_dir, extract_dir)
            self.dir = self.tar.extract_dir
        else:
            self.tar = None
            self.dir = data_dir

        self.counter = 0

        # candidate attribute columns: every column except the standard ones
        all_attrs = list(self.sel.columns.values)
        for col in ['start', 'end', 'label']:
            if col in all_attrs: 
                all_attrs.remove(col)

        if attrs is not None:
            # BUGFIX: the original code removed items from `attrs` while iterating over it 
            # (`for col in attrs: ... attrs.remove(col)`), which skips elements and can leave 
            # invalid column names in the list; it also mutated the caller's list. Filtering 
            # into a new list fixes both issues.
            self.attrs = [col for col in attrs if col in all_attrs]
        elif include_attrs:
            self.attrs = all_attrs
        else:
            self.attrs = []

        # determine if the selection table has been formatted according to 
        # the new ketos style (>=2.6.0) or the old style
        self._new_style = (self.sel.index.names[0] == "sel_id")

        if self._new_style:
            self.sel_ids = self.sel.index.get_level_values(0).unique()
            self.num_sel = len(self.sel_ids)
        else:
            self.num_sel = len(self.sel)

    def __next__(self):
        """ Returns the next audio selection.

            Returns:
                audio_sel: dict
                    Audio selection
        """
        audio_sel = self.get_selection(self.counter)
        # if reading from a tar archive, make the required audio file(s) available
        if self.tar is not None:
            self.tar.extract(audio_sel['filename'])
        self.counter = (self.counter + 1) % self.num() #update selection counter
        return audio_sel

    def num(self):
        """ Returns total number of selections.

            Returns:
                : int
                    Total number of selections.
        """
        return self.num_sel

    def reset(self):
        """ Resets the selection generator to the beginning of the selection table. """
        self.counter = 0
        if self.tar is not None:
            self.tar.close()

    def get_selection(self, n):
        """ Returns the n-th audio selection in the table.

            Args:
                n: int
                    The index (0,1,2,...) of the desired selection.

            Returns:
                res: dict
                    The selection
        """
        res = {'data_dir': self.dir}

        if self._new_style:
            selection = self.sel.loc[self.sel_ids[n]]
            res['filename'] = selection.index.values
        else:
            selection = self.sel.iloc[n]
            res['filename'] = self.sel.index.values[n][0]

        # start time (defaults to 0 if the table has no `start` column)
        if 'start' in selection.keys():
            offset = selection['start']
        else:
            offset = 0

        # duration (None if the table has no `end` column)
        if 'end' in selection.keys():
            duration = selection['end'] - offset
        else:
            duration = None

        # pass offset and duration to return dict
        res['offset'] = offset
        if duration is not None: 
            res['duration'] = duration

        # label
        if 'label' in self.sel.columns.values: 
            res['label'] = selection['label']

        # attribute columns
        for col in self.attrs:
            res[col] = selection[col]

        # for new style, convert pandas Series to numpy arrays
        if self._new_style:
            for key in res.keys():
                if isinstance(res[key], pd.Series): 
                    res[key] = res[key].values
                    if key in ["offset", "duration"]: 
                        res[key] = res[key].astype(float) #ensure float

            # OBS: for labels and attributes, use only the first entry
            for col in ["label"] + self.attrs:
                if col in res.keys() and np.ndim(res[col]) > 0: 
                    res[col] = res[col][0]

        return res
class FrameStepper(SelectionGenerator):
    """ Generates selections of uniform length `duration`, with successive 
        selections displaced by a fixed amount `step`. (If `step` is not 
        specified, it is set equal to `duration`.)

        Args:
            duration: float
                Selection length in seconds.
            step: float
                Separation between consecutive selections in seconds. If None, 
                the step size equals the selection length.
            path: str
                Path to folder containing .wav files. If None is specified, the current 
                directory will be used.
            filename: str or list(str)
                Relative path to a single .wav file or a list of .wav files. Optional.
            pad: bool
                If True (default), the last segment is allowed to extend beyond the 
                endpoint of the audio file.
    """
    def __init__(self, duration, step=None, path=None, filename=None, pad=True):
        self.duration = duration
        self.step = duration if step is None else step

        if path is None: 
            path = os.getcwd()

        if filename is None:
            # no filenames given: collect all wav files in the folder, including subfolders
            self.dir = path
            self.files = find_wave_files(path=path, return_path=True, search_subdirs=True)
            assert len(self.files) > 0, '{0} did not find any wave files in {1}'.format(self.__class__.__name__, path)
        elif isinstance(filename, str):
            # a single file
            fullpath = os.path.join(path, filename)
            assert os.path.exists(fullpath), '{0} could not find {1}'.format(self.__class__.__name__, fullpath)
            self.dir = os.path.dirname(fullpath)
            self.files = [os.path.basename(fullpath)]
        else:
            # an explicit list of files
            assert isinstance(filename, list), 'filename must be str or list(str)'
            self.dir = path
            self.files = filename

        # file durations in seconds
        self.file_durations = np.array(get_duration([os.path.join(self.dir, f) for f in self.files]))

        # number of frames per file; rounding to 6 decimals first guards against 
        # floating-point noise in the division
        self.num_segs = np.maximum((self.file_durations - self.duration) / self.step + 1, 1)
        if pad:
            self.num_segs = np.ceil(np.around(self.num_segs, decimals=6)).astype(int)
        else:
            self.num_segs = np.floor(np.around(self.num_segs, decimals=6)).astype(int)

        self.num_segs_tot = np.sum(self.num_segs)
        self.reset()

    def __next__(self):
        """ Returns the next audio selection.

            Returns:
                audio_sel: dict
                    Audio selection
        """
        audio_sel = {
            'data_dir': self.dir,
            'filename': self.files[self.file_id],
            'offset': self.time,
            'duration': self.duration,
        }
        self.time += self.step  #increment time
        self.seg_id += 1        #increment segment ID
        # if this was the last segment of the current file, jump to the next file
        if self.seg_id == self.num_segs[self.file_id]:
            self._next_file()
        return audio_sel

    def num(self):
        """ Returns total number of selections.

            Returns:
                : int
                    Total number of selections.
        """
        return self.num_segs_tot

    def _next_file(self):
        """ Jump to the next file, wrapping around at the end of the file list. """
        self.file_id = (self.file_id + 1) % len(self.files)
        self.seg_id = 0
        self.time = 0

    def reset(self):
        """ Resets the selection generator to the beginning of the first file. """
        self.file_id = -1
        self._next_file()

    def get_file_paths(self, fullpath=True):
        """ Get the paths to the audio files associated with this instance.

            Args:
                fullpath: bool
                    Whether to return the full path (default) or only the filename.

            Returns:
                ans: list
                    List of file paths
        """
        if fullpath:
            ans = [os.path.join(self.dir, f) for f in self.files]
        else:
            ans = self.files
        return ans

    def get_file_durations(self):
        """ Get the durations of the audio files associated with this instance.

            Returns:
                ans: list
                    List of file durations in seconds
        """
        return self.file_durations.tolist()
def _file_limits_warning(start, end, file_path, file_duration): """ Helper function for the AudioLoader class. Generates warning messages if the selection start or end time is outside of the audio file's limits. Args: start: float Selection start time with respect to beginning of file in seconds. end: float Selection end time with respect to beginning of file in seconds. file_path: str Full path of the audio file. file_duration: float Audio file length in seconds. """ # total length of selection if file_duration == 0: return len_tot = end - start # determine how much of the selection is outside the file len_outside = max(0, -start) + max(0, end - file_duration) warnings.formatwarning = user_format_warning # print warnings if selection end is zero or negative file_info = f"While processing {os.path.basename(file_path)}" if (end <= 0): warnings.warn(f"{file_info}: selection has negative end time ({end:.2f}s).") # print warnings if selection start is later than the file end time elif (start > file_duration): warnings.warn(f"{file_info}: selection start time exceeds file duration ({start:.2f}s).") #print a warning that the selection has 0 or negative length (end before start) elif (len_tot <= 0): warnings.warn(f"{file_info}: selection has negative duration ({start:.2f},{end:.2f}).") # print a warning that a fraction larger than 50% of the selection is outside the file elif (len_outside > 0.5 * len_tot): warnings.warn(f"{file_info}: over 50% of the selection falls outside the audio file ({start:.2f}s,{end:.2f}s).")
class AudioLoader():
    """ Class for loading segments of audio data from .wav files. 

        Several representations of the audio data are possible, including 
        waveform, magnitude spectrogram, power spectrogram, mel spectrogram, 
        and CQT spectrogram.

        Args:
            selection_gen: SelectionGenerator
                Selection generator
            channel: int
                For stereo recordings, this can be used to select which channel to read from
            annotations: pandas DataFrame
                Annotation table
            representation: class or list of classes
                Audio data representation. This is a class that must receive the raw audio 
                data and will transform the data into the specified audio representation object. 
                Classes available in ketos:
                    * Waveform: (rate), (resample_method)
                    * MagSpectrogram, PowerSpectrogram, MelSpectrogram: audio, window, step, 
                      (window_func), (rate), (resample_method)
                    * CQTSpectrogram: audio, step, bins_per_oct, (freq_min), (freq_max), 
                      (window_func), (rate), (resample_method)
                It is also possible to specify multiple audio presentations as a list.
            representation_params: dict or list of dict
                Dictionary containing any required and optional arguments for the representation 
                class. If more than one representation is given `representation_params` must be a 
                list of the same length and in the same order. If None, default parameters are 
                used for every representation.
            batch_size: int
                Load segments in batches rather than one at the time. 
            stop: bool
                Raise StopIteration when all selections have been loaded. Default is True.

        For usage examples, see the child class 
        :class:`audio.audio_loader.AudioFrameLoader`.
    """
    def __init__(self, selection_gen, channel=0, annotations=None, representation=Waveform, 
                    representation_params=None, batch_size=1, stop=True, **kwargs):
        if isinstance(representation, list):
            self.representation = representation
            if representation_params is None:
                # FIX: the original indexed into `representation_params` even when it was 
                # None, crashing whenever a list of representations was given without 
                # parameters; default to one empty dict per representation instead
                self.representation_params = [None] * len(representation)
            else:
                self.representation_params = list(representation_params)
        else:
            self.representation = [representation]
            self.representation_params = [representation_params]

        # replace missing parameter dicts with empty dicts (default params)
        for i in range(len(self.representation)):
            if self.representation_params[i] is None:
                self.representation_params[i] = {}

        self.channel = channel
        self.selection_gen = selection_gen
        self.annot = annotations
        # NOTE(review): kwargs carries extra optional arguments (e.g. compute_phase) 
        # straight through to `load`; arguably these belong in the representation 
        # parameters instead — left as-is to preserve the interface
        self.kwargs = kwargs
        self.batch_size = batch_size
        self.stop = stop
        # cache of file durations, keyed by resolved file path
        self.file_durations = dict()
        self.reset()

    def __iter__(self):
        return self

    def __next__(self):
        """ Load next audio segment or batch of audio segments.

            Depending on how the loader was initialized, the return value can either be 
            an instance of :class:`BaseAudio <ketos.audio.base_audio.BaseAudio>` (or, more 
            commonly, an instance of one of its derived classes such as the 
            :class:`Waveform <ketos.audio.waveform.Waveform>` or 
            :class:`MagSpectrogram <ketos.audio.spectrogram.MagSpectrogram>` classes), 
            a list of such objects, or a nested list of such objects:

             * a single representation and `batch_size=1` yield a single object;
             * multiple representations and `batch_size=1` yield a list with one entry 
               per representation;
             * `batch_size>1` yields an outer list of length `batch_size`.

            If the loader was initialized with `stop=True` this method will raise 
            `StopIteration` when all the selections have been loaded.

            Returns:
                a: BaseAudio, list(BaseAudio), or list(list(BaseAudio))
                    Next segment or next batch of segments
        """
        return self._next_batch(load=True)

    def skip(self):
        """ Skip to the next audio segment or batch of audio segments 
            without loading the current one.
        """
        self._next_batch(load=False)

    def _next_batch(self, load=True):
        """ Load next audio segment or batch of audio segments.

            Helper function for :meth:`__next__` and :meth:`skip`.

            Args:
                load: bool
                    Whether to load the audio data.
        """
        if self.counter == self.num():
            if self.stop:
                raise StopIteration
            else:
                self.reset()

        a = []
        for _ in range(self.batch_size):
            if self.counter < self.num():
                selection = next(self.selection_gen)
                if load:
                    a.append(self.load(**selection, **self.kwargs))
                self.counter += 1

        if load:
            if self.batch_size == 1:
                a = a[0]
            return a

    def num(self):
        """ Returns total number of segments.
        
            Returns:
                : int
                    Total number of segments.
        """
        return self.selection_gen.num()

    def load(self, data_dir, filename, offset=0, duration=None, label=None, **kwargs):
        """ Load audio segment for specified file and time.

            Args:
                data_dir: str
                    Data directory
                filename: str
                    Filename or relative path
                offset: float
                    Start time of the segment in seconds, measured from the 
                    beginning of the file.
                duration: float
                    Duration of segment in seconds.
                label: int
                    Integer label
        
            Returns:
                seg: BaseAudio or list(BaseAudio)
                    Audio segment
        """
        # convert scalar args to arrays
        if np.ndim(filename) == 0:
            filename = [filename]
            offset = np.array([offset], dtype=float)
            duration = [None] if duration is None else np.array([duration], dtype=float)

        path = [str(Path(data_dir, fname).resolve()) for fname in filename]
        seg_id = filename[0]  # renamed from `id`, which shadowed the builtin

        # issue warnings if selections extend beyond file limits
        for i, p in enumerate(path):
            file_duration = self.file_durations.get(p)
            # if the file duration is not yet cached, obtain it and cache it
            if file_duration is None:
                file_duration = get_duration(p)[0]
                self.file_durations[p] = file_duration

            start = offset[i]
            # NOTE(review): when duration is None the `end` passed to the warning helper 
            # is file_duration - start rather than file_duration; preserved from the 
            # original — confirm intended
            end = file_duration - start if duration[i] is None else start + duration[i]
            _file_limits_warning(start=start, end=end, file_path=p, file_duration=file_duration)

        # load audio, once per requested representation
        segs = []
        for i in range(len(self.representation)):
            # the duration for the representation is defined by each segment
            self.representation_params[i]['duration'] = duration

            seg = self.representation[i].from_wav(path=path, channel=self.channel, offset=offset, 
                                                  id=seg_id, **self.representation_params[i], **kwargs)

            # add label
            if label is not None:
                seg.label = label

            # add annotations, shifted to the time axis of the concatenated segment
            if self.annot is not None:
                file_offset = np.concatenate([[0], np.cumsum(duration)[:-1]])
                for j in range(len(filename)):
                    q = query(self.annot, filename=filename[j], start=offset[j], end=offset[j] + duration[j])
                    if len(q) > 0:
                        q['start'] += file_offset[j] - offset[j]
                        q['end'] += file_offset[j] - offset[j]
                        seg.annotate(df=q)

            # attach metadata; note that `filename` here is the relative filename, 
            # not the full path
            seg.start = offset[0]
            seg.filename = seg_id
            segs.append(seg)

        if len(segs) == 1:
            segs = segs[0]
        return segs

    def reset(self):
        """ Resets the audio loader to the beginning. """
        self.selection_gen.reset()
        self.counter = 0
class AudioFrameLoader(AudioLoader):
    """ Load audio segments by sliding a fixed-size frame across the recording. 

        The frame size is specified with the 'duration' argument, while the 'step' 
        argument may be used to specify the step size. (If 'step' is not specified, 
        it is set equal to 'duration'.) 

        Args:
            duration: float
                Segment duration in seconds.
            step: float
                Separation between consecutive segments in seconds. If None, the step 
                size equals the segment duration. 
            path: str
                Path to folder containing .wav files. If None is specified, the current 
                directory will be used.
            filename: str or list(str)
                Relative path to a single .wav file or a list of .wav files. Optional.
            channel: int
                For stereo recordings, this can be used to select which channel to read from
            annotations: pandas DataFrame
                Annotation table
            representation: class or list of classes
                Audio data representation. This is a class that must receive the raw audio 
                data and will transform the data into the specified audio representation 
                object. It is also possible to specify multiple audio presentations as a 
                list. These presentations must have the same duration.
            representation_params: dict or list of dict
                Dictionary containing any required and optional arguments for the 
                representation class. If more than one representation is given 
                `representation_params` must be a list of the same length and in the 
                same order.
            batch_size: int
                Load segments in batches rather than one at the time. 
            stop: bool
                Raise StopIteration if the iteration exceeds the number of available 
                selections. Default is True. (The original docstring incorrectly 
                stated False.)
            pad: bool
                If True (default), the last segment is allowed to extend beyond the 
                endpoint of the audio file.
    """
    def __init__(self, duration, step=None, path=None, filename=None, channel=0, 
                    annotations=None, representation=Waveform, representation_params=None, 
                    batch_size=1, stop=True, pad=True):
        if batch_size > 1:
            # FIX: the original built this message with a backslash line continuation 
            # inside the string literal, which embedded the source indentation (a long 
            # run of spaces) into the printed text; implicit string concatenation keeps 
            # the message single-spaced
            print("Warning: batch_size > 1 results in different behaviour for ketos "
                  "versions >= 2.4.2 than earlier versions. You may want to check out "
                  "the AudioFrameEfficientLoader class.")

        super().__init__(selection_gen=FrameStepper(duration=duration, step=step, path=path, 
            pad=pad, filename=filename), channel=channel, annotations=annotations, 
            representation=representation, representation_params=representation_params, 
            batch_size=batch_size, stop=stop)

    def get_file_paths(self, fullpath=True):
        """ Get the paths to the audio files associated with this instance.

            Args:
                fullpath: bool
                    Whether to return the full path (default) or only the filename.

            Returns:
                : list
                    List of file paths
        """
        return self.selection_gen.get_file_paths(fullpath=fullpath)

    def get_file_durations(self):
        """ Get the durations of the audio files associated with this instance.

            Returns:
                : list
                    List of file durations in seconds
        """
        return self.selection_gen.get_file_durations()
class AudioFrameEfficientLoader(AudioFrameLoader):
    """ Load audio segments by sliding a fixed-size frame across the recording.

        AudioFrameEfficientLoader implements a more efficient approach to loading
        overlapping audio segments and converting them to spectrograms. Rather
        than loading and converting one frame at the time, the
        AudioFrameEfficientLoader loads a longer frame and converts it to a
        spectrogram which is split up into the desired shorter frames.

        Use the `num_frames` argument to specify how many frames are loaded into
        memory at a time.

        While the segments are loaded into memory in batches, they are by default
        returned one at a time. Use the `return_as_batch` argument to change this
        behaviour.

        Args:
            duration: float
                Segment duration in seconds. Can also be specified via the
                'duration' item of the 'repres' dictionary.
            step: float
                Separation between consecutive segments in seconds. If None, the
                step size equals the segment duration.
            path: str
                Path to folder containing .wav files. If None is specified, the
                current directory will be used.
            filename: str or list(str)
                relative path to a single .wav file or a list of .wav files. Optional
            channel: int
                For stereo recordings, this can be used to select which channel
                to read from
            annotations: pandas DataFrame
                Annotation table. Optional.
            representation: class or list of classes
                Audio data representation. This is a class that must receive the
                raw audio data and will transform the data into the specified
                audio representation object. It is also possible to specify
                multiple audio presentations as a list. These presentations must
                have the same duration.
            representation_params: dict or list of dict
                Dictionary containing any required and optional arguments for the
                representation class. If more than one representation is given
                `representation_params` must be a list of the same length and in
                the same order.
            num_frames: int
                Load segments in batches of size `num_frames` rather than one at
                the time. Increasing `num_frames` can help reduce computational
                time. You can also specify `num_frames='file'` to load one wav
                file at the time.
            return_as_batch: bool
                Whether to return the segments individually or in batches of size
                `num_frames`. The default behaviour is to return the segments
                individually.
    """
    def __init__(self, duration=None, step=None, path=None, filename=None, channel=0,
                 annotations=None, representation=Waveform, representation_params=None,
                 num_frames=12, return_as_batch=False):

        assert (isinstance(num_frames, int) and num_frames >= 1) or \
            (isinstance(num_frames, str) and num_frames.lower() == 'file'), \
            'Argument `num_frames` must be a positive integer or have the string value `file`'

        super().__init__(duration=duration, step=step, path=path, filename=filename,
            channel=channel, annotations=annotations, representation=representation,
            representation_params=representation_params)

        self.return_as_batch = return_as_batch

        # One transforms list per audio representation, cached lazily on the
        # first call to load_next_batch and reused for every subsequent batch.
        self.transforms_list = []

        if isinstance(num_frames, int):
            self.max_batch_size = num_frames
        else:
            # num_frames='file': no fixed cap; the batch grows until the
            # stepper moves on to the next file (or wraps around).
            self.max_batch_size = np.inf

        # Prime the loader with the first selection so load_next_batch knows
        # where the upcoming batch starts.
        audio_sel = next(self.selection_gen)
        self.offset = audio_sel['offset']
        self.data_dir = audio_sel['data_dir']
        self.filename = audio_sel['filename']

    def __next__(self):
        """ Load the next audio segment or batch of audio segments.

            Depending on how the loader was initialized, the return value can
            either be an instance of :class:`BaseAudio <ketos.audio.base_audio.BaseAudio>`
            (or, more commonly, a instance of one of its derived classes such as the
            :class:`Waveform <ketos.audio.waveform.Waveform>` or
            :class:`MagSpectrogram <ketos.audio.spectrogram.MagSpectrogram>` classes),
            a list of such objects, or a nested listed of such objects.

            * If the loader was initialized with the audio representation
              `representation=Waveform`, `representation_params=None` (default) and with
              `return_as_batch=False` (default), the return value will be a single
              instance of :class:`Waveform <ketos.audio.waveform.Waveform>`.

            * If the loader was initialized with the audio representation
              `representation=[Waveform, MagSpectrogram]`,
              `representation_params=[None, {'window':0.1,'step':0.02}]` and with
              `return_as_batch=False` (default), the return value will be a list of
              length 2, where the first entry holds an instance of
              :class:`Waveform <ketos.audio.waveform.Waveform>` and the second entry
              holds an instance of
              :class:`MagSpectrogram <ketos.audio.spectrogram.MagSpectrogram>`.

            * If the loader was initialized with the audio representation
              `representation=[Waveform, MagSpectrogram]`,
              `representation_params=[None, {'window':0.1,'step':0.02}]` and with
              `return_as_batch=True`, the return value will be a nested list with
              outer length equal to `num_frames` and inner length 2, corresponding
              to the number of audio representations.

            Returns:
                : BaseAudio, list(BaseAudio), or list(list(BaseAudio))
                    Next segment or next batch of segments
        """
        if self.return_as_batch:
            self.load_next_batch()
            return self.batch
        else:
            return self.next_in_batch()

    def next_in_batch(self):
        """ Load the next audio segment.

            Returns:
                a: BaseAudio or list(BaseAudio)
                    Next audio segment
        """
        # (Re)fill the internal batch when it is empty or exhausted.
        # NOTE(review): self.counter is presumably initialized by the parent
        # AudioLoader class — confirm.
        if self.counter == 0 or self.counter >= len(self.batch):
            self.load_next_batch()

        a = self.batch[self.counter]
        self.counter += 1
        return a

    def load_next_batch(self):
        """ Load the next batch of audio objects.
        """
        self.batch_size = 0
        self.counter = 0
        offset = np.inf
        data_dir = self.data_dir
        filename = self.filename

        # Count how many consecutive selections belong to this batch: stop when
        # the stepper moves to a different file, wraps around to an earlier
        # offset within the same file, or the maximum batch size is reached.
        # The selection that triggers the exit becomes the start of the *next*
        # batch (stored at the bottom of this method).
        while data_dir == self.data_dir and filename == self.filename \
                and offset > self.offset and self.batch_size < self.max_batch_size:
            self.batch_size += 1
            audio_sel = next(self.selection_gen)
            offset = audio_sel['offset']
            data_dir = audio_sel['data_dir']
            filename = audio_sel['filename']

        # Total duration spanned by the (overlapping) frames of this batch.
        duration = self.selection_gen.duration + self.selection_gen.step * (self.batch_size - 1)

        # Load the data in one go, without applying transforms.
        self.batch = self.load(data_dir=self.data_dir, filename=self.filename,
            offset=self.offset, duration=duration, label=None)

        if not isinstance(self.batch, list):
            self.batch = [self.batch]

        # Loop over the audio representations.
        for i in range(len(self.representation)):
            # Bug fix: the transforms used to be appended to self.transforms_list
            # on *every* call, so the list grew by len(self.representation)
            # entries per batch load — an unbounded memory leak on long runs.
            # Cache each representation's transforms exactly once instead.
            if i >= len(self.transforms_list):
                self.transforms_list.append(self.representation_params[i].get('transforms', []))

            # Split the long audio object into the desired shorter frames.
            self.batch[i] = self.batch[i].segment(window=self.selection_gen.duration, step=self.selection_gen.step)

            # Apply the transforms to each segment separately, silencing any
            # warnings emitted in the process.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                for j in range(len(self.batch[i])):
                    self.batch[i][j].apply_transforms(self.transforms_list[i])

        # Single representation: unwrap the outer list.
        if len(self.batch) == 1:
            self.batch = self.batch[0]

        # Remember where the next batch starts.
        self.offset = offset
        self.data_dir = data_dir
        self.filename = filename