# ================================================================================ #
# Authors: Fabio Frazao and Oliver Kirsebom #
# Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca #
# Organization: MERIDIAN (https://meridian.cs.dal.ca/) #
# Team: Data Analytics #
# Project: ketos #
# Project goal: The ketos library provides functionalities for handling #
# and processing acoustic data and applying deep neural networks to sound #
# detection and classification tasks. #
# #
# License: GNU GPLv3 #
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <https://www.gnu.org/licenses/>. #
# ================================================================================ #
""" 'audio.audio_loader' module within the ketos library
This module contains the utilities for loading waveforms and computing spectrograms.
The audio representations currently implemented in Ketos are:
- :class:`Waveform <ketos.audio.waveform.Waveform>`
- :class:`magnitude spectrogram <ketos.audio.spectrogram.MagSpectrogram>`
- :class:`power spectrogram <ketos.audio.spectrogram.PowSpectrogram>`
- :class:`mel spectrogram <ketos.audio.spectrogram.MelSpectrogram>`
- :class:`CQT spectrogram <ketos.audio.spectrogram.CQTSpectrogram>`
- :class:`GammatoneFilterBank <ketos.audio.gammatone.GammatoneFilterBank>`
- :class:`AuralFeatures <ketos.audio.gammatone.AuralFeatures>`
"""
import os
from pathlib import Path
import pandas as pd
import numpy as np
import soundfile as sf
import tarfile
import warnings
import shutil
from ketos.audio.waveform import Waveform, get_duration
from ketos.data_handling.data_handling import find_wave_files
from ketos.data_handling.selection_table import query
from ketos.utils import floor_round_up, ceil_round_down, user_format_warning
class ArchiveManager():
    ''' Class for extracting files from a .tar file.

        Use the method :meth:`ketos.audio.audio_loader.ArchiveManager.extract` to extract one or
        several files from the .tar file to a temporary directory.

        Every time a file extraction request is submitted, the contents of the temporary directory
        are updated as follows:

            * Requested files *not* already present in the directory are extracted.
            * Requested files already present in the directory are left untouched.
            * Files present in the directory that are not part of the request are removed.

        At any given time, the location of the temporary directory and the paths of the files stored
        within the directory can be accessed via the attributes @extract_dir and @extracted_files.

        Args:
            tar_path: str
                Path to the .tar file
            extract_dir: str
                Path to the directory where the extracted files are temporarily stored. The directory
                is automatically created. If a directory already exists at the specified path, all its
                contents will be deleted. By default, audio files are extracted to the folder `kt-tmp`
                within the current working directory.

        Attributes:
            tar: TarFile
                tar object
            tar_path: str
                Path to the .tar file
            extract_dir: str
                Path to the directory where the extracted files are temporarily stored
            extracted_files: list
                Relative paths to the currently extracted files
    '''
    def __init__(self, tar_path, extract_dir="kt-tmp"):
        self.tar_path = tar_path
        self.tar = tarfile.open(tar_path)
        self.extract_dir = extract_dir
        self.extracted_files = []
        # wipe any pre-existing extraction directory, as promised in the docstring
        self.close()

    def _extract_files(self, paths):
        """ Helper function for extracting files.

            Issues a UserWarning if a file does not exist at the specified
            path within the tar archive.

            Args:
                paths: list
                    Relative paths to the files to be extracted from within the tar archive
        """
        for path in paths:
            try:
                # NOTE(security): extracting without a `filter` argument is unsafe for
                # untrusted archives (path traversal); consider `filter='data'` on
                # Python >= 3.12 if archives may come from untrusted sources.
                self.tar.extract(member=path, path=self.extract_dir)
                self.extracted_files.append(path)
            except KeyError:
                warnings.formatwarning = user_format_warning
                warnings.warn(f"{path} not found in {self.tar_path}")

    def _remove_files(self, paths):
        """ Helper function for removing files from the extraction directory.

            Args:
                paths: list
                    Relative paths to the files to be removed
        """
        for path in paths:
            dst = os.path.join(self.extract_dir, path)
            os.remove(dst)
            self.extracted_files.remove(path)

    def extract(self, paths):
        """ Update the files in the extraction directory.

            Every time this method is called, the contents of the temporary directory
            are updated as follows:

                * Requested files *not* already present in the directory are extracted.
                * Requested files already present in the directory are left untouched.
                * Files present in the directory that are not part of the request are removed.

            Args:
                paths: str or list
                    Relative path(s) of the files within the tar archive that we want to
                    be available in the extraction directory
        """
        if isinstance(paths, str):
            paths = [paths]

        paths_extract = [path for path in paths if path not in self.extracted_files]
        paths_remove = [path for path in self.extracted_files if path not in paths]

        self._extract_files(paths_extract)
        self._remove_files(paths_remove)

    def close(self):
        """ Remove the extraction directory and its contents
        """
        if os.path.exists(self.extract_dir):
            shutil.rmtree(self.extract_dir)

        # reset the bookkeeping list so the instance can be reused after closing;
        # otherwise a subsequent extract() would attempt to os.remove() files
        # that were deleted together with the extraction directory
        self.extracted_files = []
class SelectionGenerator():
    """ Template class for selection generators.

        Subclasses must implement :meth:`__next__`, :meth:`num`, and :meth:`reset`.
    """
    def __iter__(self):
        return self

    def __next__(self):
        """ Returns the next audio selection.

            An audio selection is represented as a dictionary
            with two required keys (data_dir, filename) and
            an unlimited number of optional keys, which typically
            include offset, duration, and label.

            Must be implemented in child class.

            Returns:
                : dict()
                    Next audio selection
        """
        pass

    def num(self):
        """ Returns total number of selections.

            Must be implemented in child class.

            Returns:
                : int
                    Total number of selections.
        """
        pass

    def reset(self):
        """ Resets the selection generator to the beginning.
        """
        pass
class SelectionTableIterator(SelectionGenerator):
    """ Iterates over entries in a selection table.

        Args:
            data_dir: str
                Path to top folder containing audio files, or a .tar archive file.
            selection_table: pandas DataFrame
                Selection table
            include_attrs: bool
                If True, load data from all attribute columns in the selection table. Default is False.
            attrs: list(str)
                Specify the names of the attribute columns that you wish to load data from.
                Overwrites include_attrs if specified. If None, all columns will be loaded provided that
                include_attrs=True.
            extract_dir: str
                Temporary directory for storing audio files extracted from a tar archive file.
                Only relevant if @data_dir points to a .tar file. The directory will be automatically
                created. If a directory already exists at the specified path, all its contents will be
                deleted. By default, audio files are extracted to the folder `kt-tmp` within the current
                working directory. Note that this folder must be deleted manually when it is no longer needed.
    """
    def __init__(self, data_dir, selection_table, include_attrs=False, attrs=None, extract_dir="kt-tmp"):
        self.sel = selection_table

        # if data_dir points to a tar archive, files are extracted on demand to a temp dir
        if os.path.isfile(data_dir) and tarfile.is_tarfile(data_dir):
            self.tar = ArchiveManager(data_dir, extract_dir)
            self.dir = self.tar.extract_dir
        else:
            self.tar = None
            self.dir = data_dir

        self.counter = 0

        # attribute columns are all columns except the standard start/end/label columns
        all_attrs = list(self.sel.columns.values)
        for col in ['start', 'end', 'label']:
            if col in all_attrs: all_attrs.remove(col)

        if attrs is not None:
            # keep only the requested attributes that actually exist in the table;
            # build a new list rather than removing items from `attrs` while iterating
            # over it, which skips elements and destructively edits the caller's list
            self.attrs = [col for col in attrs if col in all_attrs]
        elif include_attrs:
            self.attrs = all_attrs
        else:
            self.attrs = []

        # determine if the selection table has been formatted according to
        # the new ketos style (>=2.6.0) or the old style
        self._new_style = (self.sel.index.names[0] == "sel_id")
        if self._new_style:
            self.sel_ids = self.sel.index.get_level_values(0).unique()
            self.num_sel = len(self.sel_ids)
        else:
            self.num_sel = len(self.sel)

    def __next__(self):
        """ Returns the next audio selection.

            Returns:
                audio_sel: dict
                    Audio selection
        """
        audio_sel = self.get_selection(self.counter)
        # when reading from a tar archive, make sure the requested file(s) are extracted
        if self.tar is not None:
            self.tar.extract(audio_sel['filename'])
        self.counter = (self.counter + 1) % self.num()  # update selection counter
        return audio_sel

    def num(self):
        """ Returns total number of selections.

            Returns:
                : int
                    Total number of selections.
        """
        return self.num_sel

    def reset(self):
        """ Resets the selection generator to the beginning of the selection table.
        """
        self.counter = 0
        if self.tar is not None:
            self.tar.close()

    def get_selection(self, n):
        """ Returns the n-th audio selection in the table.

            Args:
                n: int
                    The index (0,1,2,...) of the desired selection.

            Returns:
                res: dict
                    The selection
        """
        res = {'data_dir': self.dir}

        if self._new_style:
            # new style: level 0 of the index is the selection id; a selection may
            # span several rows (one per file), so `filename` is an array
            selection = self.sel.loc[self.sel_ids[n]]
            res['filename'] = selection.index.values
        else:
            selection = self.sel.iloc[n]
            res['filename'] = self.sel.index.values[n][0]

        # start time (defaults to 0 if the table has no 'start' column)
        if 'start' in selection.keys():
            offset = selection['start']
        else:
            offset = 0

        # duration (only defined if the table has an 'end' column)
        if 'end' in selection.keys():
            duration = selection['end'] - offset
        else:
            duration = None

        # pass offset and duration to return dict
        res['offset'] = offset
        if duration is not None:
            res['duration'] = duration

        # label
        if 'label' in self.sel.columns.values:
            res['label'] = selection['label']

        # attribute columns
        for col in self.attrs:
            res[col] = selection[col]

        # for new style, convert pandas Series to numpy arrays
        if self._new_style:
            for key in res.keys():
                if isinstance(res[key], pd.Series):
                    res[key] = res[key].values
                    if key in ["offset", "duration"]:
                        res[key] = res[key].astype(float)  # ensure float
            # OBS: for labels and attributes, use only the first entry
            for col in ["label"] + self.attrs:
                if col in res.keys() and np.ndim(res[col]) > 0:
                    res[col] = res[col][0]

        return res
class FrameStepper(SelectionGenerator):
    """ Generates selections with uniform length 'duration', with successive selections
        displaced by a fixed amount 'step' (If 'step' is not specified, it is set equal
        to 'duration'.)

        Args:
            duration: float
                Selection length in seconds.
            step: float
                Separation between consecutive selections in seconds. If None, the step size
                equals the selection length.
            path: str
                Path to folder containing .wav files. If None is specified, the current directory will be used.
            filename: str or list(str)
                Relative path to a single .wav file or a list of .wav files. Optional.
            pad: bool
                If True (default), the last segment is allowed to extend beyond the endpoint of the audio file.
    """
    def __init__(self, duration, step=None, path=None, filename=None, pad=True):
        self.duration = duration
        self.step = duration if step is None else step

        if path is None:
            path = os.getcwd()

        # get all wav files in the folder, including subfolders
        if filename is None:
            self.dir = path
            self.files = find_wave_files(path=path, return_path=True, search_subdirs=True)
            assert len(self.files) > 0, '{0} did not find any wave files in {1}'.format(self.__class__.__name__, path)
        else:
            if isinstance(filename, str):
                fullpath = os.path.join(path, filename)
                assert os.path.exists(fullpath), '{0} could not find {1}'.format(self.__class__.__name__, fullpath)
                self.dir = os.path.dirname(fullpath)
                self.files = [os.path.basename(fullpath)]
            else:
                assert isinstance(filename, list), 'filename must be str or list(str)'
                self.dir = path
                self.files = filename

        # file durations in seconds
        self.file_durations = np.array(get_duration([os.path.join(self.dir, f) for f in self.files]))

        # number of frames per file; a file shorter than one frame still yields one segment
        self.num_segs = np.maximum((self.file_durations - self.duration) / self.step + 1, 1)
        # round to 6 decimals first to absorb floating-point noise at exact multiples
        if pad:
            self.num_segs = np.ceil(np.around(self.num_segs, decimals=6)).astype(int)
        else:
            self.num_segs = np.floor(np.around(self.num_segs, decimals=6)).astype(int)

        # cast to plain int so num() does not leak a numpy scalar into the API
        self.num_segs_tot = int(np.sum(self.num_segs))

        self.reset()

    def __next__(self):
        """ Returns the next audio selection.

            Returns:
                audio_sel: dict
                    Audio selection
        """
        audio_sel = {'data_dir': self.dir, 'filename': self.files[self.file_id], 'offset': self.time, 'duration': self.duration}
        self.time += self.step  # increment time
        self.seg_id += 1  # increment segment ID
        if self.seg_id == self.num_segs[self.file_id]: self._next_file()  # if this was the last segment, jump to the next file
        return audio_sel

    def num(self):
        """ Returns total number of selections.

            Returns:
                : int
                    Total number of selections.
        """
        return self.num_segs_tot

    def _next_file(self):
        """ Jump to next file.
        """
        self.file_id = (self.file_id + 1) % len(self.files)  # increment file ID
        self.seg_id = 0  # reset
        self.time = 0  # reset

    def reset(self):
        """ Resets the selection generator to the beginning of the first file.
        """
        self.file_id = -1
        self._next_file()

    def get_file_paths(self, fullpath=True):
        """ Get the paths to the audio files associated with this instance.

            Args:
                fullpath: bool
                    Whether to return the full path (default) or only the filename.

            Returns:
                ans: list
                    List of file paths
        """
        if fullpath:
            ans = [os.path.join(self.dir, f) for f in self.files]
        else:
            ans = self.files
        return ans

    def get_file_durations(self):
        """ Get the durations of the audio files associated with this instance.

            Returns:
                ans: list
                    List of file durations in seconds
        """
        return self.file_durations.tolist()
def _file_limits_warning(start, end, file_path, file_duration):
""" Helper function for the AudioLoader class.
Generates warning messages if the selection start or end time is outside of the audio file's limits.
Args:
start: float
Selection start time with respect to beginning of file in seconds.
end: float
Selection end time with respect to beginning of file in seconds.
file_path: str
Full path of the audio file.
file_duration: float
Audio file length in seconds.
"""
# total length of selection
if file_duration == 0:
return
len_tot = end - start
# determine how much of the selection is outside the file
len_outside = max(0, -start) + max(0, end - file_duration)
warnings.formatwarning = user_format_warning
# print warnings if selection end is zero or negative
file_info = f"While processing {os.path.basename(file_path)}"
if (end <= 0):
warnings.warn(f"{file_info}: selection has negative end time ({end:.2f}s).")
# print warnings if selection start is later than the file end time
elif (start > file_duration):
warnings.warn(f"{file_info}: selection start time exceeds file duration ({start:.2f}s).")
#print a warning that the selection has 0 or negative length (end before start)
elif (len_tot <= 0):
warnings.warn(f"{file_info}: selection has negative duration ({start:.2f},{end:.2f}).")
# print a warning that a fraction larger than 50% of the selection is outside the file
elif (len_outside > 0.5 * len_tot):
warnings.warn(f"{file_info}: over 50% of the selection falls outside the audio file ({start:.2f}s,{end:.2f}s).")
class AudioLoader():
    """ Class for loading segments of audio data from .wav files.

        Several representations of the audio data are possible, including
        waveform, magnitude spectrogram, power spectrogram, mel spectrogram,
        and CQT spectrogram.

        Args:
            selection_gen: SelectionGenerator
                Selection generator
            channel: int
                For stereo recordings, this can be used to select which channel to read from
            annotations: pandas DataFrame
                Annotation table
            representation: class or list of classes
                Audio data representation. This is a class that must receive the raw audio data and will transform the data
                into the specified audio representation object.
                Classes available in ketos:
                    * Waveform:
                        (rate), (resample_method)
                    * MagSpectrogram, PowerSpectrogram, MelSpectrogram:
                        audio, window, step, (window_func), (rate), (resample_method)
                    * CQTSpectrogram:
                        audio, step, bins_per_oct, (freq_min), (freq_max), (window_func), (rate), (resample_method)
                It is also possible to specify multiple audio presentations as a list.
            representation_params: dict or list of dict
                Dictionary containing any required and optional arguments for the representation class. If more than one
                representation is given `representation_params` must be a list of the same length and in the same order.
            batch_size: int
                Load segments in batches rather than one at the time.
            stop: bool
                Raise StopIteration when all selections have been loaded. Default is True.

        Examples:
            Creating an AudioLoader to load selections:

            >>> from ketos.audio.audio_loader import AudioLoader, SelectionTableIterator
            >>> from ketos.data_handling.selection_table import use_multi_indexing
            >>> import pandas as pd
            >>> # Load the audio representation you want to pass
            >>> from ketos.audio.spectrogram import MagSpectrogram
            >>> # specify the audio representation
            >>> rep = {'window':0.2, 'step':0.02, 'window_func':'hamming'}
            >>> # Load selections
            >>> sel = pd.DataFrame({'filename':["2min.wav", "2min.wav"],'start':[0.10,0.12],'end':[0.46,0.42]})
            >>> sel = use_multi_indexing(sel, 'sel_id')
            >>> # create a generator for iterating over all the selections
            >>> generator = SelectionTableIterator(data_dir="ketos/tests/assets/", selection_table=sel)
            >>> # Create a loader by passing the generator and the representation to the AudioLoader
            >>> loader = AudioLoader(selection_gen=generator, representation=MagSpectrogram, representation_params=rep)
            >>> # print number of segments
            >>> print(loader.num())
            2
            >>> # load and plot the first selection
            >>> spec = next(loader)
            >>>
            >>> import matplotlib.pyplot as plt
            >>> fig = spec.plot()
            >>> fig.savefig("ketos/tests/assets/tmp/spec_loader_2min_0.png")
            >>> plt.close(fig)

            .. image:: ../../../ketos/tests/assets/tmp/spec_loader_2min_0.png

            Creating an AudioLoader to load selections made from annotations:

            >>> from ketos.audio.audio_loader import AudioLoader, SelectionTableIterator
            >>> from ketos.data_handling.selection_table import standardize
            >>> import pandas as pd
            >>> # Load the audio representation you want to pass
            >>> from ketos.audio.spectrogram import MagSpectrogram
            >>> # specify the audio representation
            >>> rep = {'window':0.2, 'step':0.02, 'window_func':'hamming'}
            >>> # Load selections
            >>> annot = pd.DataFrame([{"filename":"2min.wav", "start":2.0, "end":3.0, "label":0},
            ...                       {"filename":"2min.wav", "start":5.0, "end":6.0, "label":0},
            ...                       {"filename":"2min.wav", "start":21.0, "end":22.0, "label":0},
            ...                       {"filename":"2min.wav", "start":25.0, "end":27.0, "label":0}])
            >>> annot_std = standardize(table=annot)
            >>> # create a generator for iterating over all the selections
            >>> generator = SelectionTableIterator(data_dir="ketos/tests/assets/", selection_table=annot_std)
            >>> # Create a loader by passing the generator and the representation to the AudioLoader
            >>> loader = AudioLoader(selection_gen=generator, representation=MagSpectrogram, representation_params=rep)
            >>> # print number of segments
            >>> print(loader.num())
            4
            >>> # load and plot the first selection
            >>> spec = next(loader)
            >>>
            >>> import matplotlib.pyplot as plt
            >>> fig = spec.plot()
            >>> fig.savefig("ketos/tests/assets/tmp/spec_loader_2min_1.png")
            >>> plt.close(fig)

            .. image:: ../../../ketos/tests/assets/tmp/spec_loader_2min_1.png

            For more examples see child class :class:`audio.audio_loader.AudioFrameLoader`
    """
    def __init__(self, selection_gen, channel=0, annotations=None, representation=Waveform, representation_params=None,
                 batch_size=1, stop=True, **kwargs):
        # normalize representation/params to two parallel lists
        if not isinstance(representation, list):
            representation = [representation]
            representation_params = [representation_params]
        if representation_params is None:
            # a list of representations was given without parameters; use defaults for all
            representation_params = [None] * len(representation)
        self.representation = representation
        # if no parameters are given for a representation, use an empty dict
        # (this will use the default params); build a new outer list so the
        # caller's list is not modified in place
        self.representation_params = [{} if params is None else params for params in representation_params]

        self.channel = channel
        self.selection_gen = selection_gen
        self.annot = annotations

        # QUESTION: kwargs is carrying more optional arguments, such as compute phase... it feels very wrong.
        # Shouldn't the phase be another representation, or an argument of the spectrogram class?
        self.kwargs = kwargs
        self.batch_size = batch_size
        self.stop = stop
        # cache of file durations in seconds, keyed by resolved file path
        self.file_durations = dict()
        self.reset()

    def __iter__(self):
        return self

    def __next__(self):
        """ Load next audio segment or batch of audio segments.

            Depending on how the loader was initialized, the return value can either be
            an instance of :class:`BaseAudio <ketos.audio.base_audio.BaseAudio>` (or,
            more commonly, a instance of one of its derived classes such as the
            :class:`Waveform <ketos.audio.waveform.Waveform>` or
            :class:`MagSpectrogram <ketos.audio.spectrogram.MagSpectrogram>`
            classes), a list of such objects, or a nested listed of such objects.

            Some examples:

            * If the loader was initialized with the audio representation `representation=Waveform`,
              `representation_params=None` (default) and with `batch_size=1` (default), the return
              value will be a single instance of :class:`Waveform <ketos.audio.waveform.Waveform>`.

            * If the loader was initialized with the audio representation
              `representation=[Waveform, MagSpectrogram]`, `representation_params=[None, {'window':0.1,'step':0.02}]`
              and with `batch_size=1` (default), the return value will be a list
              of length 2, where the first entry holds an instance of
              :class:`Waveform <ketos.audio.waveform.Waveform>` and the second entry holds an instance
              of :class:`MagSpectrogram <ketos.audio.spectrogram.MagSpectrogram>`.

            * If the loader was initialized with the audio representation
              `representation=[Waveform, MagSpectrogram]`, `representation_params=[None, {'window':0.1,'step':0.02}]`
              and with `batch_size>1`, the return value will be a nested list with outer
              length equal to `batch_size` and inner length 2, corresponding to the number of
              audio representations.

            If the loader was initialized with `stop=True` this method will raise `StopIteration`
            when all the selections have been loaded.

            Returns:
                a: BaseAudio, list(BaseAudio), or list(list(BaseAudio))
                    Next segment or next batch of segments
        """
        return self._next_batch(load=True)

    def skip(self):
        """ Skip to the next audio segment or batch of audio segments
            without loading the current one.
        """
        self._next_batch(load=False)

    def _next_batch(self, load=True):
        """ Load next audio segment or batch of audio segments.

            Helper function for :meth:`__next__()` and :meth:`skip()`.

            Args:
                load: bool
                    Whether to load the audio data.
        """
        if self.counter == self.num():
            if self.stop:
                raise StopIteration
            else:
                self.reset()

        a = []
        for _ in range(self.batch_size):
            if self.counter < self.num():
                selection = next(self.selection_gen)
                if load:
                    a.append(self.load(**selection, **self.kwargs))
                self.counter += 1

        if load:
            if self.batch_size == 1: a = a[0]
            return a

    def num(self):
        """ Returns total number of segments.

            Returns:
                : int
                    Total number of segments.
        """
        return self.selection_gen.num()

    def load(self, data_dir, filename, offset=0, duration=None, label=None, **kwargs):
        """ Load audio segment for specified file and time.

            Args:
                data_dir: str
                    Data directory
                filename: str
                    Filename or relative path
                offset: float
                    Start time of the segment in seconds, measured from the
                    beginning of the file.
                duration: float
                    Duration of segment in seconds.
                label: int
                    Integer label

            Returns:
                seg: BaseAudio or list(BaseAudio)
                    Audio segment
        """
        # convert scalar args to arrays
        if np.ndim(filename) == 0:
            filename = [filename]
            offset = np.array([offset], dtype=float)
            if duration is None:
                duration = [None]
            else:
                duration = np.array([duration], dtype=float)

        path = [str(Path(data_dir, fname).resolve()) for fname in filename]
        # the segment id is the (first) filename; `file_id` avoids shadowing the builtin `id`
        file_id = filename[0]

        # issue warnings if selections extend beyond file limits
        for i, p in enumerate(path):
            file_duration = self.file_durations.get(p)
            # if the file duration is not cached yet, read it from the file and cache it
            if file_duration is None:
                file_duration = get_duration(p)[0]
                self.file_durations[p] = file_duration
            start = offset[i]
            end = file_duration - start if duration[i] is None else start + duration[i]
            _file_limits_warning(start=start, end=end, file_path=p, file_duration=file_duration)

        # load audio
        segs = []
        for i in range(len(self.representation)):  # for each representation
            # the duration for the representation is defined by each segment
            self.representation_params[i]['duration'] = duration
            seg = self.representation[i].from_wav(path=path, channel=self.channel, offset=offset, id=file_id,
                                                  **self.representation_params[i], **kwargs)

            # add label
            if label is not None:
                seg.label = label

            # add annotations, shifting their times from file coordinates to segment coordinates
            if self.annot is not None:
                file_offset = np.concatenate([[0], np.cumsum(duration)[:-1]])
                for j in range(len(filename)):
                    q = query(self.annot, filename=filename[j], start=offset[j], end=offset[j] + duration[j])
                    if len(q) > 0:
                        q['start'] += file_offset[j] - offset[j]
                        q['end'] += file_offset[j] - offset[j]
                        seg.annotate(df=q)

            # NOTE: metadata such as start, duration, and filename could be added directly to the
            # object here. The problem is the duration, which may differ from what the user set
            # if the representation needs to add some extra seconds. Also, `filename` holds just
            # the filename rather than the full path.
            seg.start = offset[0]
            seg.filename = file_id
            segs.append(seg)

        if len(segs) == 1: segs = segs[0]
        return segs

    def reset(self):
        """ Resets the audio loader to the beginning.
        """
        self.selection_gen.reset()
        self.counter = 0
class AudioFrameLoader(AudioLoader):
    """ Load audio segments by sliding a fixed-size frame across the recording.

        The frame size is specified with the 'duration' argument, while the 'step'
        argument may be used to specify the step size. (If 'step' is not specified,
        it is set equal to 'duration'.)

        Args:
            duration: float
                Segment duration in seconds.
            step: float
                Separation between consecutive segments in seconds. If None, the step size
                equals the segment duration.
            path: str
                Path to folder containing .wav files. If None is specified, the current directory will be used.
            filename: str or list(str)
                relative path to a single .wav file or a list of .wav files. Optional
            channel: int
                For stereo recordings, this can be used to select which channel to read from
            annotations: pandas DataFrame
                Annotation table
            representation: class or list of classes
                Audio data representation. This is a class that must receive the raw audio data
                and will transform the data into the specified audio representation object.
                It is also possible to specify multiple audio presentations as a list. These
                presentations must have the same duration.
            representation_params: dict or list of dict
                Dictionary containing any required and optional arguments for the representation class. If more than one
                representation is given `representation_params` must be a list of the same length and in the same order.
            batch_size: int
                Load segments in batches rather than one at the time.
            stop: bool
                Raise StopIteration if the iteration exceeds the number of available selections. Default is True.
            pad: bool
                If True (default), the last segment is allowed to extend beyond the endpoint of the audio file.

        Examples:
            >>> from ketos.audio.audio_loader import AudioFrameLoader
            >>> # Load the audio representation you want to pass
            >>> from ketos.audio.spectrogram import MagSpectrogram
            >>> # specify path to wav file
            >>> filename = 'ketos/tests/assets/2min.wav'
            >>> # check the duration of the audio file
            >>> from ketos.audio.waveform import get_duration
            >>> print(get_duration(filename)[0])
            120.832
            >>> # specify the audio representation parameters
            >>> rep = {'window':0.2, 'step':0.02, 'window_func':'hamming', 'freq_max':1000.}
            >>> # create an object for loading 30-s long spectrogram segments, using a step size of 15 s (50% overlap)
            >>> loader = AudioFrameLoader(duration=30., step=15., filename=filename, representation=MagSpectrogram, representation_params=rep)
            >>> # print number of segments
            >>> print(loader.num())
            8
            >>> # load and plot the first segment
            >>> spec = next(loader)
            >>>
            >>> import matplotlib.pyplot as plt
            >>> fig = spec.plot()
            >>> fig.savefig("ketos/tests/assets/tmp/spec_2min_0.png")
            >>> plt.close(fig)

            .. image:: ../../../ketos/tests/assets/tmp/spec_2min_0.png
    """
    def __init__(self, duration, step=None, path=None, filename=None, channel=0,
                 annotations=None, representation=Waveform, representation_params=None, batch_size=1,
                 stop=True, pad=True):
        if batch_size > 1:
            # use the warnings module (consistent with the rest of this module) rather than print
            warnings.formatwarning = user_format_warning
            warnings.warn("batch_size > 1 results in different behaviour for ketos versions >= 2.4.2 "
                          "than earlier versions. You may want to check out the AudioFrameEfficientLoader class.")

        super().__init__(selection_gen=FrameStepper(duration=duration, step=step, path=path, pad=pad, filename=filename),
                         channel=channel, annotations=annotations, representation=representation,
                         representation_params=representation_params, batch_size=batch_size, stop=stop)

    def get_file_paths(self, fullpath=True):
        """ Get the paths to the audio files associated with this instance.

            Args:
                fullpath: bool
                    Whether to return the full path (default) or only the filename.

            Returns:
                ans: list
                    List of file paths
        """
        return self.selection_gen.get_file_paths(fullpath=fullpath)

    def get_file_durations(self):
        """ Get the durations of the audio files associated with this instance.

            Returns:
                ans: list
                    List of file durations in seconds
        """
        return self.selection_gen.get_file_durations()
class AudioFrameEfficientLoader(AudioFrameLoader):
""" Load audio segments by sliding a fixed-size frame across the recording.
AudioFrameEfficientLoader implements a more efficient approach to loading
overlapping audio segments and converting them to spectrograms.
Rather than loading and converting one frame at the time, the
AudioFrameEfficientLoader loads a longer frame and converts it to a
spectrogram which is split up into the desired shorter frames.
Use the `num_frames` argument to specify how many frames are loaded into
memory at a time.
While the segments are loaded into memory in batches, they are by default
returned one at a time. Use the `return_as_batch` argument to change this
behaviour.
Args:
duration: float
Segment duration in seconds. Can also be specified via the 'duration'
item of the 'repres' dictionary.
step: float
Separation between consecutive segments in seconds. If None, the step size
equals the segment duration.
path: str
Path to folder containing .wav files. If None is specified, the current directory will be used.
filename: str or list(str)
relative path to a single .wav file or a list of .wav files. Optional
channel: int
For stereo recordings, this can be used to select which channel to read from
annotations: pandas DataFrame
Annotation table. Optional.
representation: class or list of classes
Audio data representation. This is a class that must receive the raw audio data
and will transform the data into the specified audio representation object.
It is also possible to specify multiple audio presentations as a list. These
presentations must have the same duration.
representation_params: dict or list of dict
Dictionary containing any required and optional arguments for the representation class. If more than one
representation is given `representation_params` must be a list of the same length and in the same order.
num_frames: int
Load segments in batches of size `num_frames` rather than one at the time.
Increasing `num_frames` can help reduce computational time.
You can also specify `num_frames='file'` to load one wav file at the time.
return_as_batch: bool
Whether to return the segments individually or in batches of size `num_frames`.
The default behaviour is to return the segments individually.
"""
def __init__(self, duration=None, step=None, path=None, filename=None, channel=0,
             annotations=None, representation=Waveform, representation_params=None,
             num_frames=12, return_as_batch=False):
    """ Initialize the loader and prime it with the first audio selection.

        See the class docstring for a description of the arguments.

        Raises:
            ValueError: if `num_frames` is neither a positive integer nor
                the string 'file'.
    """
    # Validate with an explicit exception rather than `assert`, which is
    # silently stripped when Python runs with the -O flag.
    if not ((isinstance(num_frames, int) and num_frames >= 1) or
            (isinstance(num_frames, str) and num_frames.lower() == 'file')):
        raise ValueError('Argument `num_frames` must be a positive integer or have the string value `file`')

    super().__init__(duration=duration, step=step, path=path, filename=filename,
                     channel=channel, annotations=annotations, representation=representation,
                     representation_params=representation_params)

    self.return_as_batch = return_as_batch
    self.transforms_list = []

    # num_frames='file' means "no fixed batch size": keep loading segments
    # until the next selection belongs to a different wav file.
    if isinstance(num_frames, int):
        self.max_batch_size = num_frames
    else:
        self.max_batch_size = np.inf

    # Prime the loader with the first selection so that load_next_batch
    # knows where the next batch starts.
    audio_sel = next(self.selection_gen)
    self.offset = audio_sel['offset']
    self.data_dir = audio_sel['data_dir']
    self.filename = audio_sel['filename']
def __next__(self):
    """ Return the next audio segment, or the next batch of segments.

        The return type depends on how the loader was configured:

        * With a single audio representation, e.g. `representation=Waveform`
          with `representation_params=None` (the defaults), and
          `return_as_batch=False` (default), each call yields a single
          instance of that representation, e.g. a
          :class:`Waveform <ketos.audio.waveform.Waveform>` or a
          :class:`MagSpectrogram <ketos.audio.spectrogram.MagSpectrogram>`.

        * With multiple representations, e.g.
          `representation=[Waveform, MagSpectrogram]` and
          `representation_params=[None, {'window':0.1,'step':0.02}]`, each
          segment is a list with one entry per representation, in the same
          order as they were specified.

        * With `return_as_batch=True`, the call returns a whole batch at
          once, i.e., a list of up to `num_frames` segments, each of which
          may itself be a list as described above.

        Returns:
            : BaseAudio, list(BaseAudio), or list(list(BaseAudio))
                Next segment or next batch of segments
    """
    if not self.return_as_batch:
        return self.next_in_batch()
    self.load_next_batch()
    return self.batch
def next_in_batch(self):
    """ Return the next audio segment from the in-memory batch, loading a
        fresh batch first if the current one is exhausted.

        Returns:
            segment: BaseAudio or list(BaseAudio)
                Next audio segment
    """
    # counter == 0 indicates that no batch has been consumed yet;
    # counter >= len(batch) indicates the current batch is used up.
    exhausted = self.counter == 0 or self.counter >= len(self.batch)
    if exhausted:
        self.load_next_batch()
    segment = self.batch[self.counter]
    self.counter += 1
    return segment
def load_next_batch(self):
    """ Load the next batch of audio objects.

        Advances the selection generator until the next selection belongs to a
        different file, jumps backwards in offset, or the maximum batch size is
        reached; then loads that entire stretch of audio with a single read,
        computes the audio representation(s), splits the result into the
        individual segments, and applies the configured transforms to each
        segment.
    """
    self.batch_size = 0
    self.counter = 0
    offset = np.inf
    data_dir = self.data_dir
    filename = self.filename
    # Count how many consecutive selections can be served from one
    # contiguous read of the current file.
    while data_dir == self.data_dir and filename == self.filename and offset > self.offset \
            and self.batch_size < self.max_batch_size:
        self.batch_size += 1
        audio_sel = next(self.selection_gen)
        offset = audio_sel['offset']
        data_dir = audio_sel['data_dir']
        filename = audio_sel['filename']

    # total duration covered by `batch_size` segments spaced `step` apart
    duration = self.selection_gen.duration + self.selection_gen.step * (self.batch_size - 1)

    # load the data without applying transforms
    self.batch = self.load(data_dir=self.data_dir, filename=self.filename, offset=self.offset,
                           duration=duration, label=None)
    if not isinstance(self.batch, list):
        self.batch = [self.batch]

    # Rebuild the transforms list on every call instead of appending to it:
    # the previous append-per-call behaviour made the list grow without
    # bound as successive batches were loaded.
    self.transforms_list = [self.representation_params[i]['transforms']
                            if 'transforms' in self.representation_params[i] else []
                            for i in range(len(self.representation))]

    # loop over the representations
    for i in range(len(self.representation)):
        # segment the data
        self.batch[i] = self.batch[i].segment(window=self.selection_gen.duration,
                                              step=self.selection_gen.step)
        # apply the transforms to each segment separately, silencing any
        # warnings emitted by the transform functions
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for j in range(len(self.batch[i])):
                self.batch[i][j].apply_transforms(self.transforms_list[i])

    if len(self.batch) == 1:
        self.batch = self.batch[0]

    # remember where the next batch starts
    self.offset = offset
    self.data_dir = data_dir
    self.filename = filename