Source code for ketos.data_handling.selection_table

# ================================================================================ #
#   Authors: Fabio Frazao and Oliver Kirsebom                                      #
#   Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca                               #
#   Organization: MERIDIAN (https://meridian.cs.dal.ca/)                           #
#   Team: Data Analytics                                                           #
#   Project: ketos                                                                 #
#   Project goal: The ketos library provides functionalities for handling          #
#   and processing acoustic data and applying deep neural networks to sound        #
#   detection and classification tasks.                                            #
#                                                                                  #
#   License: GNU GPLv3                                                             #
#                                                                                  #
#       This program is free software: you can redistribute it and/or modify       #
#       it under the terms of the GNU General Public License as published by       #
#       the Free Software Foundation, either version 3 of the License, or          #
#       (at your option) any later version.                                        #
#                                                                                  #
#       This program is distributed in the hope that it will be useful,            #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#       GNU General Public License for more details.                               # 
#                                                                                  #
#       You should have received a copy of the GNU General Public License          #
#       along with this program.  If not, see <https://www.gnu.org/licenses/>.     #
# ================================================================================ #

""" selection_table module within the ketos library.

    This module provides functions for handling annotation tables and creating 
    selection tables. 

    A Ketos annotation table always has the column 'label'. 
    For call-level annotations, the table also contains the columns 'start' 
    and 'end', giving the start and end time of the call measured in seconds 
    since the beginning of the file. 
    The table may also contain the columns 'freq_min' and 'freq_max', giving the 
    minimum and maximum frequencies of the call in Hz, but this is not required.    
    The user may add any number of additional columns.
    Note that the table uses two levels of indices, the first index being the 
    filename and the second index being an integer to identify annotations 
    pertaining to the same file. 

    Here is a minimal example of an annotation table,

            +----------------------+-------+
            |                      | label |
            +-----------+----------+-------+
            | filename  | annot_id |       |
            +-----------+----------+-------+
            | file1.wav | 0        | 2     |
            +-----------+----------+-------+
            |           | 1        | 1     |
            +-----------+----------+-------+
            |           | 2        | 2     |
            +-----------+----------+-------+
            | file2.wav | 0        | 2     |
            +-----------+----------+-------+
            |           | 1        | 2     |
            +-----------+----------+-------+
            |           | 2        | 1     |
            +-----------+----------+-------+

    And here is a more extensive example with time information (call-level annotations) 
    and a few extra columns ('min_freq', 'max_freq' and 'file_time_stamp'),

            +----------------------+-------+------+-------+----------+----------+---------------------+
            |                      | start | end  | label | min_freq | max_freq | file_time_stamp     |
            +-----------+----------+-------+------+-------+----------+----------+---------------------+
            | filename  | annot_id |                                                                  |
            +-----------+----------+-------+------+-------+----------+----------+---------------------+
            | file1.wav | 0        | 7.0   | 8.1  | 2     | 180.6    | 294.3    | 2019-02-24 13:15:00 |
            +-----------+----------+-------+------+-------+----------+----------+---------------------+
            |           | 1        | 8.5   | 12.5 | 1     | 174.2    | 258.7    | 2019-02-24 13:15:00 |
            |           +----------+-------+------+-------+----------+----------+---------------------+
            |           | 2        | 13.1  | 14.0 | 2     | 183.4    | 292.3    | 2019-02-24 13:15:00 |
            +-----------+----------+-------+------+-------+----------+----------+---------------------+
            | file2.wav | 0        | 2.2   | 3.1  | 2     | 148.8    | 286.6    | 2019-02-24 13:30:00 |
            +-----------+----------+-------+------+-------+----------+----------+---------------------+
            |           | 1        | 5.8   | 6.8  | 2     | 156.6    | 278.3    | 2019-02-24 13:30:00 |
            |           +----------+-------+------+-------+----------+----------+---------------------+
            |           | 2        | 9.0   | 13.0 | 1     | 178.2    | 304.5    | 2019-02-24 13:30:00 |
            +-----------+----------+-------+------+-------+----------+----------+---------------------+

    Ketos selection tables also use two levels of indices. The first index is a unique, integer identifier, 
    while the second index is the filename. Moreover, selection tables always contain the columns 'start' 
    and 'end' giving the start and end time of the selection window measured in seconds since the beginning 
    of the file. This structure allows selections to span multiple files. The user may add any number of 
    additional columns to a selection table.

    Here is a minimal example of a selection table,

            +--------------------+-------+------+
            |                    | start | end  |
            +--------+-----------+-------+------+
            | sel_id | filename  |              |
            +--------+-----------+-------+------+
            | 0      | file1.wav | 1.5   | 4.5  |
            +--------+-----------+-------+------+
            | 1      | file1.wav | 12.0  | 15.0 |
            +--------+-----------+-------+------+
            |        | file2.wav | 0.0   | 5.0  |
            +--------+-----------+-------+------+
            | 2      | file2.wav | 2.0   | 10.0 |
            +--------+-----------+-------+------+
            | 3      | file2.wav | 7.0   | 15.0 |
            +--------+-----------+-------+------+
"""

import os
import warnings
import numpy as np
import pandas as pd
from ketos.utils import str_is_int, fractional_overlap
from ketos.data_handling.data_handling import find_wave_files, find_files, parse_datetime
from ketos.audio.waveform import get_duration



[docs]
def unfold(table, sep=','):
    """ Unfolds rows containing multiple labels.

        Every row whose 'label' entry holds several labels joined by `sep` 
        is replaced by one row per label, with the values of all the other 
        columns repeated. Label values are converted to str before being 
        split. The input table is not modified.

        Args:
            table: pandas DataFrame
                Annotation table. Must contain a column named 'label'.
            sep: str
                Character used to separate multiple labels. It is treated 
                as a literal string, not as a regular expression.

        Returns:
            : pandas DataFrame
                Unfolded table. The 'label' column is placed last and the 
                rows originating from the same input row share its index value.
    """
    # astype returns a new object, so the caller's table is left untouched
    df = table.astype({'label': 'str'})

    # split on the separator requested by the caller (literal match)
    df['label'] = df['label'].map(lambda labels: labels.split(sep))

    # one output row per label
    df = df.explode('label')

    # keep the established column layout, where 'label' is the last column
    cols = [c for c in df.columns if c != 'label'] + ['label']
    return df[cols]




[docs]
def rename_columns(table, mapper):
    """ Rename column headings so that they follow the ketos naming convention.

        Args:
            table: pandas DataFrame
                Annotation table.
            mapper: dict
                Maps the current column headings (keys) to the standard 
                ketos headings (values).

        Returns:
            : pandas DataFrame
                Table with the new headings.
    """
    renamed = table.rename(mapper, axis='columns')
    return renamed




[docs]
def empty_annot_table():
    """ Build a call-level annotation table that contains no rows.

        Returns:
            df: pandas DataFrame
                Empty annotation table with the columns 'label', 'start' 
                and 'end', indexed by filename and 'annot_id'.
    """
    columns = ['filename', 'label', 'start', 'end']
    return use_multi_indexing(pd.DataFrame(columns=columns), 'annot_id')



def empty_selection_table():
    """ Build a selection table that contains no rows.

        Returns:
            df: pandas DataFrame
                Empty selection table with the columns 'label', 'start', 
                'end' and 'annot_id', indexed by filename and 'sel_id'.
    """
    columns = ['filename', 'label', 'start', 'end', 'annot_id']
    return use_multi_indexing(pd.DataFrame(columns=columns), 'sel_id')



[docs]
def standardize(annotations=None, sep=',', labels='auto', unfold_labels=False, 
                label_sep=',', trim_table=False, datetime_format=None, table=None, path=None):
    """ Standardize the annotation table format.

        The input table can be passed as a pandas DataFrame or as the filename of a csv file.
        The table may have either a single label per row, in which case unfold_labels should be set 
        to False, or multiple labels per row (e.g. as a comma-separated list of values), in which 
        case unfold_labels should be set to True and label_sep should be specified.

        The input table must already use the ketos column names; use 
        :func:`data_handling.selection_table.rename_columns` beforehand if it does not.

        Note that the standardized output table has two levels of indices, the first index being the 
        filename and the second index the annotation identifier. 

        The label mapping is stored as a class attribute named 'label_dict' within the output table 
        and may be retrieved with `df.attrs['label_dict']`.

        If the input is a DataFrame, it is left unmodified; the function works on a copy.

        Required Columns:
            - 'filename': The name or path of the file associated with each annotation.
            - 'label': The label or category associated with each annotation. If `unfold_labels` is True,
                    this column may contain multiple labels separated by `label_sep`.

            Optional Columns (depending on usage):
            - 'start': The start time or position of the annotation.
            - 'end': The end time or position of the annotation.

        Args:
            annotations: str, path-like, or pandas DataFrame
                If a string or path-like object, it is assumed to be the path to a CSV file containing 
                the annotation table. If a pandas DataFrame, it is used as the annotation table.
            sep: str
                Column separator of the CSV file. Only relevant if `annotations` is a file path. Default is ",".
            labels: 'auto', None, dict, or list
                - 'auto' (default): All unique labels in the table are automatically mapped to integers starting from 0.
                - None: No label mapping is applied, labels are left as-is.
                - dict: A user-specified mapping of labels to integers (Note that ketos expects labels to be incremental integers starting with 0).
                - list: A subset of labels to map to integers starting from 0.
                Any unspecified (or missing) label is mapped to -1.
                If unfold_labels is True, the mapping is applied to the individual labels obtained 
                after unfolding. Since unfolded labels are strings, the keys of a user-specified 
                mapping are compared by their str representation in that case.
            unfold_labels: bool
                Should be set to True if any of the rows have multiple labels and False otherwise (default).
            label_sep: str
                Character used to separate multiple labels. Only relevant if unfold_labels is set to True. Default is ",".
            trim_table: bool
                Keep only the columns prescribed by the Ketos annotation format, i.e., 'filename', 'label', 
                'start', 'end', 'freq_min' and 'freq_max'.
            datetime_format: str
                String defining the date-time format. 
                Example: %d_%m_%Y* would capture "14_3_1999.txt".
                See https://pypi.org/project/datetime-glob/ for a list of valid directives.
                If specified, the method will look for a column named 'datetime' and, if found, attempt to parse the 
                values in this column. If your datetime column has a different name, rename it to 'datetime' 
                before calling this function. If the method does not find a column named 'datetime' it will 
                attempt to parse the datetime information from the filename column.
            table: pandas DataFrame (deprecated)
                Deprecated. Use 'annotations' instead.
            path: str (deprecated)
                Deprecated. Use 'annotations' instead.

        Returns:
            df: pandas DataFrame
                Standardized annotation table

        Raises:
            ValueError: if `annotations` is neither a DataFrame nor a file path, or if `labels` 
                has an unsupported value.
            AssertionError: if the table lacks the 'filename' or 'label' column.
    """
    if table is not None or path is not None:
        warnings.warn("The 'table' and 'path' arguments are deprecated and will be removed in a future version. "
                      "Use the 'annotations' argument instead.", DeprecationWarning, stacklevel=2)
        annotations = table if table is not None else path

    # Determine if 'annotations' is a file path or a DataFrame
    if isinstance(annotations, (str, os.PathLike)):
        df = pd.read_csv(annotations, sep=sep)
    elif isinstance(annotations, pd.DataFrame):
        # work on a copy so that the caller's table is never modified
        df = annotations.copy()
    else:
        raise ValueError("Annotations must be a pandas DataFrame or a file path to a CSV file.")

    # Reject unsupported values of the labels argument before doing any work
    auto_labels = isinstance(labels, str) and labels == 'auto'
    if not (auto_labels or labels is None or isinstance(labels, (list, dict))):
        raise ValueError("Unsupported value for labels argument. Use 'auto', None, a list of labels, or a dict.")

    # Ensure the DataFrame contains 'filename' and 'label' columns before they are accessed.
    # An explicit raise is used (rather than an assert statement) so that the check 
    # also runs under `python -O`; the exception type is kept for backward compatibility.
    required_columns = ['filename', 'label']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise AssertionError(f"Missing required column(s): {', '.join(missing_columns)}")

    # keep only relevant columns
    if trim_table:
        df = trim(df)

    # Unfold first, so that the label mapping below acts on individual labels 
    # rather than on the combined multi-label strings
    if unfold_labels:
        df = unfold(df, sep=label_sep)

    # Handle label mapping based on the labels argument
    label_mapping = dict()
    if auto_labels:
        # missing labels are excluded here and end up as -1 below
        unique_labels = sorted(df['label'].dropna().unique())
        label_mapping = {label: i for i, label in enumerate(unique_labels)}
    elif isinstance(labels, list):
        label_mapping = {label: i for i, label in enumerate(labels)}
    elif isinstance(labels, dict):
        label_mapping = labels

    if labels is not None:
        lookup = label_mapping
        if unfold_labels:
            # unfolded labels are strings; match the mapping keys accordingly
            lookup = {str(key): value for key, value in label_mapping.items()}
        df['label'] = df['label'].map(lookup).fillna(-1)

    # always sort by filename (first) and start time (second)
    by = ['filename']
    if 'start' in df.columns.values: 
        by += ['start']
    df = df.sort_values(by=by, ignore_index=True)

    # parse datetime field
    if datetime_format is not None:
        if 'datetime' in df.columns.values:
            df['datetime'] = df['datetime'].apply(lambda x: parse_datetime(x, fmt=datetime_format))
        else:
            df['datetime'] = df.apply(lambda x: parse_datetime(os.path.basename(x.filename), fmt=datetime_format), axis=1)
    
    # transform to multi-indexing
    df = use_multi_indexing(df, 'annot_id')

    # store label dictionary as class attribute
    df.attrs["label_dict"] = label_mapping
    
    # enforce float for select columns
    float_cols = ['start','end','freq_min','freq_max']
    for c in float_cols:
        if c in df.columns.values:
            df[c] = df[c].astype(float)
        
    return df




[docs]
def use_multi_indexing(df, level_1_name):
    """ Convert a singly-indexed table into a doubly-indexed table. 
        
        The first index level is the filename. The second index level 
        is an integer counter that starts from 0 for every filename and 
        increases by one for each row of that file.

        Args:
            df: pandas DataFrame
                Singly-indexed table. Must contain a column named 'filename'. 
            level_1_name: str
                Name given to the second index level, e.g. 'annot_id' or 'sel_id'.

        Returns:
            : pandas DataFrame
                Multi-indexed table, sorted by filename and, within each file, 
                by the original index. The 'filename' column is removed.
    """
    # index by (filename, original index) so that the sort below preserves 
    # the original row order within each file
    indexed = df.set_index([df.filename, df.index])
    indexed = indexed.drop(columns=['filename']).sort_index()

    # replace the original index values by a per-file running counter
    filenames = indexed.index.get_level_values(0)
    counter = indexed.groupby(level=0).cumcount()
    indexed.index = pd.MultiIndex.from_arrays([filenames, counter], 
                                              names=['filename', level_1_name])
    return indexed




[docs]
def trim(table, extra_cols=None):
    """ Remove every column that is not part of the Ketos annotation format.

        The columns retained are 'filename', 'label', 'start', 'end', 
        'freq_min' and 'freq_max', plus any listed in `extra_cols`.

        Args:
            table: pandas DataFrame
                Annotation table. 
            extra_cols: list(str)
                Additional columns to retain.

        Returns:
            : pandas DataFrame
                Annotation table without the unwanted columns.
    """
    standard = ['filename', 'label', 'start', 'end', 'freq_min', 'freq_max']
    wanted = standard if extra_cols is None else [*standard, *extra_cols]
    unwanted = [name for name in table.columns.values if name not in wanted]
    return table.drop(columns=unwanted)



[docs]
def is_standardized(table, has_time=False, verbose=True):
    """ Check whether the table has the index levels and the minimum set of 
        columns required by the Ketos annotation format.

        The index levels must be named 'filename' and 'annot_id' (in that order) 
        and the table must have a 'label' column.

        Args:
            table: pandas DataFrame
                Annotation table. 
            has_time: bool
                Also require the 'start' and 'end' columns.
            verbose: bool
                If True and the table is not standardized, print a message with 
                example tables in the standard format.

        Returns:
            res: bool
                True if the table has the standardized Ketos format. False otherwise.
    """
    expected_index = ['filename', 'annot_id']
    expected_cols = ['label', 'start', 'end'] if has_time else ['label']

    present = table.columns.values
    missing = [col for col in expected_cols if col not in present]
    res = (table.index.names == expected_index) and not missing

    if res:
        return res

    message = """ Your table is not in the Ketos format.

            It should have two levels of indices: filename and annot_id.
            It should also contain at least the 'label' column.
            If your annotations have time information, these should appear in the 'start' and 'end' columns

            extra columns are allowed.

            Here is a minimum example:

                                 label
            filename  annot_id                    
            file1.wav 0          2
                      1          1
                      2          2
            file2.wav 0          2
                      1          2
                      2          1


            And here is a table with time information and a few extra columns ('min_freq', 'max_freq' and 'file_time_stamp')

                                 start   end  label  min_freq  max_freq  file_time_stamp
            filename  annot_id                    
            file1.wav 0           7.0   8.1      2    180.6     294.3    2019-02-24 13:15:00
                      1           8.5  12.5      1    174.2     258.7    2019-02-24 13:15:00
                      2          13.1  14.0      2    183.4     292.3    2019-02-24 13:15:00
            file2.wav 0           2.2   3.1      2    148.8     286.6    2019-02-24 13:30:00
                      1           5.8   6.8      2    156.6     278.3    2019-02-24 13:30:00
                      2           9.0  13.0      1    178.2     304.5    2019-02-24 13:30:00

    
    """
    # equality test (not truthiness) so that only True-like flags enable printing
    if verbose == True:
        print(message)            

    return res



def _create_label_dict(labels, discard_labels=None, start_labels_at_1=False):
    """ Create a label dictionary that maps the input labels to consecutive integers.

        The entries of `labels` are mapped, in order, to 0,1,2,... (or 1,2,3,... 
        if `start_labels_at_1` is True), while the entries of `discard_labels` 
        are mapped to -1. If a label appears in both, the value assigned via 
        `labels` takes precedence.

        This function is deprecated and will be removed in a future version.
        Users should transition to creating their own label mapping.

        Args:
            labels: list, or list of lists
                Labels of interest. Several labels can be mapped to the same integer 
                by using nested lists. For example, labels=[A,[B,C]] results in A 
                being mapped to 0 and B and C both being mapped to 1.
            discard_labels: list
                Labels that are grouped into a common "discard" class (-1).
            start_labels_at_1: bool
                Map labels to 1,2,3,... instead of 0,1,2,... Default is False.

        Returns:
            label_dict: dict
                Dict that maps old labels to new labels.
    """
    # Deprecation warning to alert users
    warnings.warn(
        "_create_label_dict is deprecated and will be removed in a future version. "
        "Users should transition to creating their own label mapping.",
        DeprecationWarning
    )

    discarded = [] if discard_labels is None else discard_labels
    label_dict = {name: -1 for name in discarded}

    first_value = 1 if start_labels_at_1 else 0
    for value, entry in enumerate(labels, start=first_value):
        # a nested list groups several labels under the same integer
        members = entry if isinstance(entry, list) else [entry]
        for name in members:
            label_dict[name] = value

    return label_dict



[docs]
def label_occurrence(table):
    """ Count how many times each unique label occurs in the table.

        The input table must have the standardized Ketos format, see 
        :func:`data_handling.selection_table.standardize`. In particular, each 
        annotation should have only a single label value.

        Args:
            table: pandas DataFrame
                Input table. Must contain a column named 'label'.

        Results:
            occurrence: dict
                Dictionary with the labels as keys and the number of rows 
                carrying each label as values.
    """
    rows_per_label = table.groupby('label').size()
    return rows_per_label.to_dict()



[docs]
def cast_to_str(labels, nested=False):
    """ Convert every label to str format. 

        With nested=True, the input may contain sublists; the nesting is 
        preserved in the first return value and a flattened copy is returned 
        as a second value.

        Args:
            labels: list
                Input labels
            nested: bool
                Indicate if the input list contains (or may contain) sublists.
                False by default. If True, a flattened version of the 
                list is also returned.

        Results:
            labels_str: list
                Labels converted to str format
            labels_str_flat: list
                Flattened list of labels. Only returned if nested is set to True.
    """
    if not nested:
        return list(map(str, labels))

    labels_str, labels_str_flat = [], []
    for item in labels:
        if isinstance(item, list):
            # convert the sublist as a unit, keeping its grouping
            group = [str(member) for member in item]
            labels_str.append(group)
            labels_str_flat.extend(group)
        else:
            text = str(item)
            labels_str.append(text)
            labels_str_flat.append(text)

    return labels_str, labels_str_flat



[docs]
def select(annotations, length, step=0, min_overlap=0, center=False, discard_long=False, 
    keep_id=False, keep_freq=False, label=None, avoid_label=None, discard_outside=False, files=None):
    """ Generate a selection table by defining intervals of fixed length around 
        annotated sections of the audio data. Each selection created in this 
        way is characterized by a single, integer-valued, label.

        This approach to generating selections lends itself well to cases in which 
        the annotated sections are well separated and rarely overlap. If this is not 
        the case, you may find the related function :func:`data_handling.selection_table.select_by_segmenting` 
        more useful.

        By default all annotated sections are used for generating selections except 
        those with label -1 which are ignored. Use the `label` argument to only 
        generate selections for specific labels. 

        Conversely, the argument `avoid_label` can be used to ensure that the generated 
        selections do not overlap with annotated sections with specific labels. For example, 
        if `label=[1,2]` and `avoid_label=[4]`, selections will be generated for every 
        annotated section with label 1 or 2, but any selection that happens to overlap with 
        an annotated section with label 4 will be discarded. 

        The input table must have the standardized Ketos format and contain call-level 
        annotations, see :func:`data_handling.selection_table.standardize`.

        The output table uses two levels of indexing, the first level being the 
        filename and the second level being a selection id.

        The generated selections have uniform length given by the `length` argument. Annotated 
        sections longer than the specified length will be cropped (unless discard_long=True) 
        whereas shorter sections will be extended to achieve the specified length. 

        The `step` and `min_overlap` arguments may be used to generate multiple, time-shifted 
        selections for every annotated section.

        Note that the selections may have negative start times and/or end times 
        that exceed the file duration, unless discard_outside=True in which case only 
        selections with start times and end times within the file duration are returned.

        Annotations with a start time greater than their end time are skipped and a 
        warning is issued for each of them.

        If the input table is empty, or if no annotation is left after the label and 
        length filters have been applied, an empty selection table is returned.

        Args:
            annotations: pandas DataFrame
                Input table with call-level annotations.
            length: float
                Selection length in seconds.
            step: float
                Produce multiple selections for each annotated  section by shifting the selection 
                window in steps of length step (in seconds) both forward and backward in 
                time. The default value is 0.
            min_overlap: float
                Minimum required overlap between the selection and the annotated section, expressed 
                as a fraction of whichever of the two is shorter. Only used if step > 0. 
            center: bool
                Center annotations. Default is False, in which case the selection window is 
                placed randomly relative to the annotation.
            discard_long: bool
                Discard all annotations longer than the output length. Default is False.
            keep_id: bool
                For each generated selection, include the id of the annotation from which 
                the selection was generated.
            keep_freq: bool
                For each generated selection, include the min and max frequency, if known.
            label: int or list(int)
                Only create selections for annotated sections with these labels.
            avoid_label: int, list(int) or str
                Avoid overlap with annotated sections with these labels. If overlap is to be avoided 
                with all other labels but the labels specified by the `label` argument, set `avoid_label="ALL"`.
            discard_outside: bool
                Discard selections that extend beyond file duration. Requires that a file duration 
                table is specified via the `files` argument.
            files: pandas DataFrame
                Table with file durations in seconds. Must contain columns named 'filename' and 'duration'.
                Only required if `discard_outside=True`.

        Results:
            df: pandas DataFrame
                Output selection table.

        Example:
            >>> import pandas as pd
            >>> from ketos.data_handling.selection_table import select, standardize
            >>> 
            >>> #Load and inspect the annotations.
            >>> df = pd.read_csv("ketos/tests/assets/annot_001.csv")
            >>>
            >>> #Standardize annotation table format
            >>> df = standardize(df, labels={0:1, 1:2})
            >>> print(df)
                                start   end  label
            filename  annot_id                    
            file1.wav 0           7.0   8.1      2
                      1           8.5  12.5      1
                      2          13.1  14.0      2
            file2.wav 0           2.2   3.1      2
                      1           5.8   6.8      2
                      2           9.0  13.0      1
            >>> 
            >>> #Create a selection table by defining intervals of fixed 
            >>> #length around every annotation.
            >>> #Set the length to 3.0 sec and require a minimum overlap of 16%
            >>> #between selection and annotations.
            >>> #Also, create multiple time-shifted versions of the same selection
            >>> #using a step size of 1.0 sec.     
            >>> df_sel = select(df, length=3.0, step=1.0, min_overlap=0.16, center=True, keep_id=True) 
            >>> print(df_sel.round(2))
                              label  start    end  annot_id
            filename  sel_id                               
            file1.wav 0           2   5.05   8.05         0
                      1           1   6.00   9.00         1
                      2           2   6.05   9.05         0
                      3           1   7.00  10.00         1
                      4           2   7.05  10.05         0
                      5           1   8.00  11.00         1
                      6           1   9.00  12.00         1
                      7           1  10.00  13.00         1
                      8           1  11.00  14.00         1
                      9           2  11.05  14.05         2
                      10          1  12.00  15.00         1
                      11          2  12.05  15.05         2
                      12          2  13.05  16.05         2
            file2.wav 0           2   0.15   3.15         0
                      1           2   1.15   4.15         0
                      2           2   2.15   5.15         0
                      3           2   3.80   6.80         1
                      4           2   4.80   7.80         1
                      5           2   5.80   8.80         1
                      6           1   6.50   9.50         2
                      7           1   7.50  10.50         2
                      8           1   8.50  11.50         2
                      9           1   9.50  12.50         2
                      10          1  10.50  13.50         2
                      11          1  11.50  14.50         2
                      12          1  12.50  15.50         2
    """
    if len(annotations) == 0:
        return empty_selection_table()

    df = annotations.copy()
    df['annot_id'] = df.index.get_level_values(1)

    # check that input table has expected format
    assert is_standardized(df, has_time=True), 'Annotation table appears not to have the expected structure.'

    # select labels (numpy integers are treated like Python ints)
    if label is not None:
        if isinstance(label, (int, np.integer)): label = [label]
        df = df[df['label'].isin(label)]

    # discard annotations with label -1
    # (explicit copy so that the column assignment below acts on an independent table)
    df = df[df['label'] != -1].copy()

    # compute length of every annotation
    df['length'] = df['end'] - df['start']

    # discard annotations longer than the requested length
    if discard_long:
        df = df[df['length'] <= length]

    # We need to ensure that the annotation is valid, that is, the start time must be smaller than end time. 
    # Otherwise we skip (remove) the annotation and throw a warning
    negative_length = df[df['length'] < 0] # select rows with the issue to issue warnings
    
    if (len(negative_length.index) > 0):
        df = df[df['length'] >= 0] # remove the rows from the dataframe

        for idx,row in negative_length.iterrows():
            warnings.warn("File {0}, annotation {1} has a start time ({2}) greater than end time ({3}). Skipping annotation".format(idx[0], idx[1], row['start'], row['end']), category=UserWarning, stacklevel=2)  

    # nothing left to build selections from: the steps below (concatenation of the 
    # time-shifted tables, row-wise apply) cannot handle an empty table
    if len(df) == 0:
        return empty_selection_table()

    # independent copy, so that new columns can be added safely
    df = df.copy()

    # number of annotations
    N = len(df)

    # alignment of new annotations relative to original ones
    if center:
        df['start_new'] = df['start'] + 0.5 * (df['length'] - length)
    else:
        df['start_new'] = df['start'] + np.random.random_sample(N) * (df['length'] - length)

    # create multiple time-shifted instances of every annotation
    if step > 0:
        shifted = []
        for idx,row in df.iterrows():
            t = row['start_new']
            df_shift = time_shift(annot=row, time_ref=t, length=length, min_overlap=min_overlap, step=step)
            df_shift['filename'] = idx[0]
            shifted.append(df_shift)

        # concatenate once, instead of growing the table inside the loop
        df_new = pd.concat(shifted)

        # sort by filename and offset
        df = df_new.sort_values(by=['filename','start_new'], axis=0, ascending=[True,True]).reset_index(drop=True)

        # transform to multi-indexing
        df = use_multi_indexing(df, 'sel_id')

    # rename index
    df.index.rename('sel_id', level=1, inplace=True) 

    # drop old/temporary columns, and rename others
    df = df.drop(['start', 'end', 'length'], axis=1)
    df = df.rename(columns={"start_new": "start"})
    df['end'] = df['start'] + length

    # keep annotation id
    if not keep_id:
        df = df.drop(columns=['annot_id'])
    else:
        # re-order columns so annot_id appears last
        cols = df.columns.values.tolist()
        p = cols.index('annot_id')
        cols_new = cols[:p] + cols[p+1:] + ['annot_id']
        df = df[cols_new]
        df = df.astype({'annot_id': int}) #ensure annot_id is int

    # ensure label is integer
    df = df.astype({'label':int})

    # discard selections that overlap with unwanted annotations
    if avoid_label is not None:

        if isinstance(avoid_label, (int, np.integer)): 
            avoid_label = [avoid_label]

        elif isinstance(avoid_label, str) and avoid_label.lower() == "all" and label is not None:
            labels = pd.unique(annotations.label)
            avoid_label = labels[~np.isin(labels,label)]
            avoid_label = avoid_label.tolist()

        # the row-wise apply is skipped for an empty table, for which it 
        # would not return a one-dimensional result
        if isinstance(avoid_label, list) and len(df) > 0:
            def func(y, start, end, label):
                # number of annotations in y, with one of the given labels, 
                # that overlap with the interval [start, end]
                y = y[(y.label.isin(label)) & (y.end>=start) & (y.start<=end)]
                return len(y)
            
            df['overlap'] = df.apply(lambda x: func(annotations.loc[x.name[0]], label=avoid_label, start=x.start, end=x.end), axis=1)
            df = df[df['overlap']==0]
            df = df.drop(columns=['overlap'])
        
    # discard selections that extend beyond duration of file
    if discard_outside:
        if files is None:
            warnings.warn("discard_outside=True requires files to be specified")
        elif len(df) > 0:
            files = files.set_index('filename')
            df['outside'] = df.apply(lambda x: (x.start < 0) or (x.end > files.loc[x.name[0]].duration), axis=1)
            df = df[df['outside']==False]
            df = df.drop(columns=['outside'])

    if not keep_freq:
        df = df.drop(columns=['freq_min','freq_max'], errors='ignore')

    return df



[docs]
def time_shift(annot, time_ref, length, step, min_overlap):
    """ Create multiple instances of the same selection by stepping in time, both 
        forward and backward.

        The time-shifted instances are returned in a pandas DataFrame with the same columns as the 
        input annotation, plus a column named 'start_new' containing the start times 
        of the shifted instances.

        The instance starting at `time_ref` is always included in the output, 
        irrespective of its overlap with the annotation.

        Args:
            annot: pandas Series or dict
                Reference annotation. Must contain the labels/keys 'start' and 'end'.
            time_ref: float
                Reference time used as starting point for the stepping.
            length: float
                Output annotation length in seconds.
            step: float
                Produce multiple instances of the same selection by shifting the annotation 
                window in steps of length step (in seconds) both forward and backward in 
                time. If 0, no shifting is performed and only the reference instance 
                is returned.
            min_overlap: float
                Minimum required overlap between the selection intervals and the original 
                annotation, expressed as a fraction of whichever is smaller, the annotation 
                duration or the selection length.   

        Returns:
            df: pandas DataFrame
                Output annotation table. The start times of the time-shifted annotations are 
                stored in the column 'start_new'.

        Raises:
            AssertionError: if length is not strictly positive.
            TypeError: if annot is neither a dict nor a pandas Series.

        Example:
            >>> import pandas as pd
            >>> from ketos.data_handling.selection_table import time_shift
            >>> 
            >>> #Create a single 2-s long annotation
            >>> annot = {'filename':'file1.wav', 'label':1, 'start':12.0, 'end':14.0}
            >>>
            >>> #Step across this annotation with a step size of 0.2 s, creating 1-s long annotations that 
            >>> #overlap by at least 50% with the original 2-s annotation 
            >>> df = time_shift(annot, time_ref=13.0, length=1.0, step=0.2, min_overlap=0.5)
            >>> print(df.round(2))
                filename  label  start   end  start_new
            0  file1.wav      1   12.0  14.0       11.6
            1  file1.wav      1   12.0  14.0       11.8
            2  file1.wav      1   12.0  14.0       12.0
            3  file1.wav      1   12.0  14.0       12.2
            4  file1.wav      1   12.0  14.0       12.4
            5  file1.wav      1   12.0  14.0       12.6
            6  file1.wav      1   12.0  14.0       12.8
            7  file1.wav      1   12.0  14.0       13.0
            8  file1.wav      1   12.0  14.0       13.2
            9  file1.wav      1   12.0  14.0       13.4
    """
    if length <= 0:
        raise AssertionError("Length must be positive and greater than zero, found {0}".format(length))

    if isinstance(annot, dict):
        row = pd.Series(annot)
    elif isinstance(annot, pd.Series):
        row = annot.copy()
    else:
        # fail with a clear message instead of a NameError further down
        raise TypeError("annot must be a dict or a pandas Series, found {0}".format(type(annot)))
    
    row['start_new'] = time_ref
    rows_new = [row]

    # With a zero step every "shifted" window coincides with the reference 
    # window, so the stepping loop would never terminate. Skip it entirely.
    if step != 0:
        # step backwards and forwards
        for sign in [-1, 1]:
            counter = 1
            while True:
                t0 = time_ref + sign * counter * step
                o = fractional_overlap(a=(t0,t0+length), b=(row['start'], row['end']))
                # Stop once the overlap requirement is no longer met. The explicit 
                # 'o > 0' test guarantees termination when min_overlap <= 0, since 
                # a window that has moved past the annotation cannot overlap again.
                if not (o >= min_overlap and o > 0):
                    break
                ri = row.copy()
                ri['start_new'] = t0
                rows_new.append(ri)
                counter += 1

    # create DataFrame
    df = pd.DataFrame(rows_new)

    # sort according to new start time
    df = df.sort_values(by=['start_new'], axis=0, ascending=[True]).reset_index(drop=True)

    return df



[docs]
def file_duration_table(path, search_subdirs=False, datetime_format=None):
    """ Create file duration table.

        Args:
            path: str
                Path to folder with audio files with extensions wav, WAV, flac, FLAC.
            search_subdirs: bool
                If True, also include any audio files found in subdirectories.
                Default is False.
            datetime_format: str
                String defining the date-time format. 
                Example: %d_%m_%Y* would capture "14_3_1999.txt".
                See https://pypi.org/project/datetime-glob/ for a list of valid directives.
                If specified, the method will attempt to parse the datetime information from the filename.

        Returns:
            df: pandas DataFrame
                File duration table. Columns: filename, duration, (datetime)
    """
    paths = find_files(path=path, return_path=True, search_subdirs=search_subdirs, substr=['.wav', '.WAV', '.flac', '.FLAC'])
    durations = get_duration([os.path.join(path,p) for p in paths])
    df = pd.DataFrame({'filename':paths, 'duration':durations})

    if datetime_format is not None:
        # Parse the timestamp from the base name of every file. A plain list is used 
        # rather than DataFrame.apply(axis=1) because the latter returns a DataFrame 
        # (not a Series) for an empty table, which cannot be assigned to one column.
        df['datetime'] = [parse_datetime(os.path.basename(f), fmt=datetime_format) for f in df['filename']]

    return df



[docs]
def create_rndm_selections(files, length, num, label=0, annotations=None, no_overlap=False, trim_table=False, buffer=0):
    """ Create selections of uniform length, randomly distributed across the 
        data set and not overlapping with any annotations. The created selections
        will have a label value defined by the 'label' parameter.

        The random sampling is performed without regard to already created 
        selections. Therefore, it is in principle possible that some of the created 
        selections will overlap, although in practice this will only occur with very 
        small probability, unless the number of requested selections (num) is very 
        large and/or the (annotation-free part of) the data set is small in size.

        To avoid any overlap, set the 'no_overlap' to True, but note that this can 
        lead to longer execution times.

        Use the 'buffer' argument to ensure a minimum separation between
        selections and the annotated segments. This can be useful if the annotation 
        start and end times are not always fully accurate.

        Files shorter than the requested selection length are ignored. If the file 
        table is empty, or no file is long enough, an empty selection table is returned.

        Note that sampling continues until `num` admissible selections have been found. 
        If the constraints cannot be met (e.g. the annotations cover all the audio data),
        the method does not return.

        Args:
            files: pandas DataFrame
                Table with file durations in seconds. 
                Should contain columns named 'filename' and 'duration'.
                If a column named 'offset' is present, its value is added to the 
                start time of every selection drawn from the corresponding file.
            length: float
                Selection length in seconds.
            num: int
                Number of selections to be created.
            label: int
                Value to be assigned to the created selections.
            annotations: pandas DataFrame
                Annotation table. Optional.
            no_overlap: bool
                If True, randomly selected segments will have no overlap.
            trim_table: bool
                Keep only the columns prescribed by the Ketos annotation format.
            buffer: float
                Minimum separation in seconds between the background selections and the annotated segments.
                The default value is zero.

        Returns:
            table_backgr: pandas DataFrame
                Output selection table.

        Example:
            >>> import pandas as pd
            >>> import numpy as np
            >>> from ketos.data_handling.selection_table import create_rndm_selections, standardize
            >>> 
            >>> #Ensure reproducible results by fixing the random number generator seed.
            >>> np.random.seed(3)
            >>> 
            >>> #Load and inspect the annotations.
            >>> df = pd.read_csv("ketos/tests/assets/annot_001.csv")
            >>> print(df)
                filename  start   end  label
            0  file1.wav    7.0   8.1      1
            1  file1.wav    8.5  12.5      0
            2  file1.wav   13.1  14.0      1
            3  file2.wav    2.2   3.1      1
            4  file2.wav    5.8   6.8      1
            5  file2.wav    9.0  13.0      0
            >>>
            >>> #Standardize annotation table format
            >>> df = standardize(annotations=df, labels={0:1, 1:2})  # Standardize annotation table format (we want to create background random segments with label 0, so lets map the labels we have to 1 and 2)
            >>> print(df)
                                start   end  label
            filename  annot_id                    
            file1.wav 0           7.0   8.1      2
                      1           8.5  12.5      1
                      2          13.1  14.0      2
            file2.wav 0           2.2   3.1      2
                      1           5.8   6.8      2
                      2           9.0  13.0      1
            >>>
            >>> #Enter file durations into a pandas DataFrame
            >>> file_dur = pd.DataFrame({'filename':['file1.wav','file2.wav','file3.wav',], 'duration':[18.,20.,15.]})
            >>> 
            >>> #Create randomly sampled background selection with fixed 3.0-s length.
            >>> df_bgr = create_rndm_selections(annotations=df, files=file_dur, length=3.0, num=12, trim_table=True) 
            >>> print(df_bgr.round(2))
                              start    end  label
            filename  sel_id                     
            file1.wav 0        3.38   6.38      0
                      1        3.89   6.89      0
            file2.wav 0       16.52  19.52      0
            file3.wav 0        0.29   3.29      0
                      1        2.77   5.77      0
                      2        3.23   6.23      0
                      3        5.49   8.49      0
                      4        5.63   8.63      0
                      5        6.69   9.69      0
                      6        6.71   9.71      0
                      7        8.18  11.18      0
                      8       10.33  13.33      0
    """
    if len(files) == 0:
        return empty_selection_table()

    assert isinstance(label, int), 'label is not int. Found {0}'.format(type(label))

    # Work on an explicit copy so that adding columns below neither touches the 
    # caller's table nor triggers pandas' SettingWithCopyWarning.
    c = files[['filename','duration']].copy()

    # Use per-file offsets if provided. The test is made against the column labels; 
    # `columns.names` holds the names of the column index levels, not the labels.
    if 'offset' in files.columns:
        c['offset'] = files['offset']
    else:
        c['offset'] = 0

    # compute lengths, and discard segments shorter than requested length
    c.reset_index(drop=True, inplace=True)
    c['length'] = c['duration'] - length
    c = c[c['length'] >= 0]

    # no file is long enough to accommodate a selection of the requested length
    if len(c) == 0:
        return empty_selection_table()

    start_list, end_list, filename_list = [], [], []
    # Convert data from pandas to lists and numpy arrays for use inside the sampling loop 
    # (much more efficient than accessing pandas objects row by row).
    # All arrays are taken from the filtered table so that they have matching sizes.
    durations = c['duration'].to_numpy()
    lengths = c['length'].to_numpy()
    offsets = c['offset'].to_numpy()
    filenames = c['filename'].tolist()

    cnt = 0
    # longer files are sampled with proportionally higher probability
    probabilities = durations/durations.sum()

    # randomly sample
    # Loop until we achieve the desired number of samples
    while cnt < num:
        # Randomly choose the files to sample from, drawing `num` file indices in one go.
        indices = np.random.choice(len(c), size=num, replace=True, p=probabilities)

        # The nested loop is purely for efficiency: random.choice with p specified is the 
        # most expensive operation here, so we call it once per batch instead of once per 
        # sample. Rejected samples are simply made up for in the next batch.
        for idx in indices:
            # Randomly sample a segment of duration = length from the file
            t = np.random.random_sample() * lengths[idx]
            start = t + offsets[idx]
            end   = start + length
            fname = filenames[idx]

            # If given, check that the sampled segment does not overlap with an annotation
            if annotations is not None:
                q = query(annotations, filename=fname, start=start-buffer, end=end+buffer)
                if len(q) > 0: continue
            
            # If set, check that the segment does not overlap with previously accepted segments
            if no_overlap and cnt > 0:
                # Query requires a table as first argument, so create a temporary one
                tmp_df = pd.DataFrame({'start': start_list, 'end': end_list, 'filename': filename_list})
                q = query(tmp_df.set_index(tmp_df.filename), filename=fname, start=start, end=end)
                if len(q) > 0: continue

            start_list.append(start)
            end_list.append(end)
            filename_list.append(fname)
            cnt += 1

            if cnt == num:
                break

    # Create pandas DataFrame at the end
    df = pd.DataFrame({'start': start_list, 'end': end_list, 'filename': filename_list})
    # Keep all columns that were present in the files table (inner join on the filename)
    df = pd.DataFrame.merge(df, files, on='filename')
    
    # sort by filename and start time
    df = df.sort_values(by=['filename','start'], axis=0, ascending=[True,True]).reset_index(drop=True)

    # re-order columns
    col_names = ['filename','start','end']
    if not trim_table:
        names = df.columns.values.tolist()
        for name in col_names: names.remove(name)
        col_names += names

    df = df[col_names]

    df['label'] = label #add label

    # transform to multi-indexing
    df = use_multi_indexing(df, 'sel_id')

    return df



[docs]
def random_choice(df, siz):
    """ Draw a random subset of rows from a table, without replacement.

        If the table has fewer than `siz` rows, all rows are returned.

        Args:
            df: pandas DataFrame
                Selection table or annotation table
            siz: int
                Number of elements to be selected

        Returns:
            sel: pandas DataFrame
                Reduced table, indexed by filename and selection/annotation id 
                and sorted by index.
    """
    # selection tables use 'sel_id' as id level, annotation tables 'annot_id'
    if 'sel_id' in df.index.names:
        id_col = 'sel_id'
    else:
        id_col = 'annot_id'

    # draw row positions without replacement and put them in ascending order
    num_rows = len(df)
    num_picked = min(siz, num_rows)
    positions = np.random.choice(np.arange(num_rows, dtype=int), size=num_picked, replace=False)
    positions = np.sort(positions).tolist()

    # flatten the index so that rows can be addressed by position, 
    # then restore the (filename, id) index on the chosen rows
    flat = df.reset_index()
    picked = flat.loc[positions]
    picked = picked.set_index(['filename', id_col])

    return picked.sort_index()



[docs]
def select_by_segmenting(files, length, annotations=None, step=None,
    pad=True, discard_empty=False, keep_only_empty=False, label_empty=0, 
    avoid_label=None):
    """ Build a selection table by sliding a window of fixed size (length) across 
        the audio files in fixed increments (step). 
        
        In contrast to the :func:`data_handling.selection_table.select` method, the 
        selections produced here do not carry a single integer label. Instead, each 
        selection is associated with a list of annotations, which may be empty.

        For this reason, when annotations are provided, two tables are returned: a 
        selection table indexed by filename and segment id, and an annotation table 
        indexed by filename, segment id, and annotation id.

        The exception is `keep_only_empty=True`, in which case only a selection table 
        is returned. That table has a `label` column in which every entry is set to 
        the value given by `label_empty`.

        Annotations with label -1 are ignored.

        Args:
            files: pandas DataFrame
                Table with file durations in seconds. 
                Should contain columns named 'filename' and 'duration'.
            length: float
                Selection length in seconds.
            annotations: pandas DataFrame
                Annotation table.
            step: float
                Selection step size in seconds. If None, the step size is set 
                equal to the selection length.
            pad: bool
                If True (default), the last selection window is allowed to extend 
                beyond the endpoint of the audio file.
            discard_empty: bool
                If True, only selection that contain annotations will be used. 
                If False (default), all selections are used.
            keep_only_empty: bool
                If True, only selections *without* any annotations are used, and 
                only the selections table is returned. Default is False. 
            label_empty: int
                Only relevant if keep_only_empty is True. Value to be assigned to
                selections without annotations. Default is 0.
            avoid_label: int or list(int)
                If specified, only selections without annotations with these labels 
                are used.

        Returns:
            sel: pandas DataFrame
                Selection table
            annot: pandas DataFrame
                Annotations table. Only returned if annotations is specified 
                and keep_only_empty is False.

        Example:
            >>> import pandas as pd
            >>> from ketos.data_handling.selection_table import select_by_segmenting, standardize
            >>> 
            >>> #Load and inspect the annotations.
            >>> annot = pd.read_csv("ketos/tests/assets/annot_001.csv")
            >>>
            >>> #Standardize annotation table format
            >>> annot = standardize(annot, labels={0:1, 1:2})
            >>> print(annot)
                                start   end  label
            filename  annot_id                    
            file1.wav 0           7.0   8.1      2
                      1           8.5  12.5      1
                      2          13.1  14.0      2
            file2.wav 0           2.2   3.1      2
                      1           5.8   6.8      2
                      2           9.0  13.0      1
            >>>
            >>> #Create file table
            >>> files = pd.DataFrame({'filename':['file1.wav', 'file2.wav', 'file3.wav'], 'duration':[11.0, 19.2, 15.1]})
            >>> print(files)
                filename  duration
            0  file1.wav      11.0
            1  file2.wav      19.2
            2  file3.wav      15.1
            >>>
            >>> #Create a selection table by splitting the audio data into segments of 
            >>> #uniform length. The length is set to 10.0 sec and the step size to 5.0 sec.
            >>> sel = select_by_segmenting(files=files, length=10.0, annotations=annot, step=5.0) 
            >>> #Inspect the selection table
            >>> print(sel[0].round(2))
                              start   end
            filename  sel_id             
            file1.wav 0         0.0  10.0
                      1         5.0  15.0
            file2.wav 0         0.0  10.0
                      1         5.0  15.0
                      2        10.0  20.0
            file3.wav 0         0.0  10.0
                      1         5.0  15.0
                      2        10.0  20.0
            >>> #Inspect the annotations
            >>> print(sel[1].round(2))
                                       start   end  label
            filename  sel_id annot_id                    
            file1.wav 0      0           7.0   8.1      2
                             1           8.5  12.5      1
                      1      0           2.0   3.1      2
                             1           3.5   7.5      1
                             2           8.1   9.0      2
                      2      1          -1.5   2.5      1
                             2           3.1   4.0      2
            file2.wav 0      0           2.2   3.1      2
                             1           5.8   6.8      2
                             2           9.0  13.0      1
                      1      1           0.8   1.8      2
                             2           4.0   8.0      1
                      2      2          -1.0   3.0      1
    """
    # default to non-overlapping, contiguous windows
    seg_step = length if step is None else step

    if annotations is not None:
        # validate the table format, then drop annotations labelled -1
        assert is_standardized(annotations, has_time=True), 'Annotation table appears not to have the expected structure.'
        annotations = annotations[annotations.label != -1]

    # segment the files
    sel = segment_files(files, length=length, step=seg_step, pad=pad)

    # largest number of segments found in any single file
    num_segs = sel.index.get_level_values(1).max() + 1

    # without annotations there is nothing more to do
    if annotations is None:
        return sel

    # segment the annotations
    annot = segment_annotations(annotations, num=num_segs, length=length, step=seg_step)

    # (filename, sel_id) pairs of the selections that contain at least one annotation
    occupied = list({(fname, seg_id) for fname, seg_id, _ in annot.index.tolist()})

    if keep_only_empty:
        sel = sel.loc[~sel.index.isin(occupied)].sort_index()
        sel['label'] = label_empty
        return sel

    if discard_empty:
        sel = sel.loc[occupied].sort_index()

    if avoid_label is not None:
        labels_to_avoid = [avoid_label] if isinstance(avoid_label, int) else avoid_label
        flagged = annot[annot.label.isin(labels_to_avoid)]
        # (filename, sel_id) pairs of the selections containing an unwanted annotation
        blocked = list({(fname, seg_id) for fname, seg_id, _ in flagged.index.tolist()})
        sel = sel.loc[~sel.index.isin(blocked)].sort_index()

    return sel, annot



[docs]
def segment_files(table, length, step=None, pad=True):
    """ Generate a selection table by stepping across the audio files, using a fixed 
        step size (step) and fixed selection window size (length). 

        The input table is not modified.

        Args:
            table: pandas DataFrame
                File duration table. Must contain columns named 'filename' and 'duration'.
            length: float
                Selection length in seconds.
            step: float
                Selection step size in seconds. If None, the step size is set 
                equal to the selection length.
            pad: bool
                If True (default), the last selection window is allowed to extend 
                beyond the endpoint of the audio file.

        Returns:
            df: pandas DataFrame
                Selection table, indexed by filename and sel_id, with columns 
                'start' and 'end' (plus any extra columns of the input table).
    """
    if step is None:
        step = length

    # work on a copy so the caller's table does not acquire a 'num' column
    tbl = table.copy()

    # compute number of segments for each file (never negative);
    # rounding up allows the last window to extend past the end of the file
    num = (tbl['duration'] - length) / step + 1
    num = np.maximum(num, 0)
    num = np.ceil(num) if pad else np.floor(num)
    tbl['num'] = num.astype(int)

    # Repeat every row once per segment. Rows are addressed by position rather than 
    # by index label, so that duplicate index labels do not multiply the rows.
    rows = np.repeat(np.arange(len(tbl)), tbl['num'].to_numpy())
    df = tbl.iloc[rows]

    # index by filename and a running segment counter within each file
    df = df.set_index(keys=['filename'], append=True)
    df = df.swaplevel()
    df = df.sort_index()
    df.index = pd.MultiIndex.from_arrays(
        [df.index.get_level_values(0), df.groupby(level=0).cumcount()],
        names=['filename', 'sel_id'])

    df['start'] = df.index.get_level_values(1) * step
    df['end'] = df['start'] + length
    df = df.drop(columns=['num','duration'])

    return df



[docs]
def segment_annotations(table, num, length, step=None, compute_overlap=False):
    """ Generate a segmented annotation table by stepping across the audio files, using a fixed 
        step size (step) and fixed selection window size (length). 

        For every segment, the annotations that overlap with the segment are copied and 
        their start and end times are shifted so that they are measured relative to the 
        start of the segment. The annotations are not cropped, so start times can be 
        negative and end times can exceed the segment length.

        If no annotation overlaps with any of the segments, an empty table with the 
        same structure is returned.
        
        Args:
            table: pandas DataFrame
                Annotation table.
            num: int
                Number of segments
            length: float
                Selection length in seconds.
            step: float
                Selection step size in seconds. If None, the step size is set 
                equal to the selection length.
            compute_overlap: bool
                If True, the fractional overlap between the selection window and the annotation 
                will be computed and added as an extra column in the output table. Default is False.

        Returns:
            df: pandas DataFrame
                Annotations table, with 'sel_id' inserted as second-to-last index level.
    """
    if step is None:
        step = length

    segs = []
    for n in range(num):
        # select annotations that overlap with segment
        t1 = n * step
        t2 = t1 + length
        a = table[(table.start < t2) & (table.end > t1)].copy()
        if len(a) == 0:
            continue

        # overlap must be computed before the times are shifted
        if compute_overlap:
            a['overlap'] = a.apply(lambda r: fractional_overlap(a=(t1,t2), b=(r.start, r.end)), axis=1)

        # express times relative to the start of the segment
        a['start'] -= t1 
        a['end'] -= t1 
        a['sel_id'] = n #map to segment
        segs.append(a)

    if segs:
        df = pd.concat(segs)
    else:
        # pd.concat raises on an empty list, so build an empty table with 
        # the same columns as the non-empty result would have
        df = table.iloc[0:0].copy()
        if compute_overlap:
            df['overlap'] = np.array([], dtype=float)
        df['sel_id'] = np.array([], dtype=int)

    df.set_index(keys=['sel_id'], inplace=True, append=True)
    df = df.swaplevel()
    df = df.sort_index()
    return df



[docs]
def query(selections, annotations=None, filename=None, label=None, start=None, end=None):
    """ Filter a selection table by audio file, label and/or time window.

        Dispatches to :func:`query_annotated` if an annotation table is 
        supplied, and to :func:`query_labeled` otherwise.

        Args:
            selections: pandas DataFrame
                Selections table
            annotations: pandas DataFrame
                Annotations table. Optional.
            filename: str or list(str)
                Filename(s)
            label: int or list(int)
                Label(s)
            start: float
                Earliest end time in seconds
            end: float
                Latest start time in seconds

        Returns:
            : pandas DataFrame or tuple(pandas DataFrame, pandas DataFrame)
            Selection table, accompanied by an annotation table if an input 
            annotation table is provided.
    """
    if annotations is not None:
        return query_annotated(selections, annotations, filename=filename, label=label, start=start, end=end)

    return query_labeled(selections, filename=filename, label=label, start=start, end=end)



[docs]
def query_labeled(table, filename=None, label=None, start=None, end=None):
    """ Filter a labeled table by audio file, label and/or time window.

        The filters are applied one after the other; a filter whose argument 
        is None is skipped.

        Args:
            table: pandas DataFrame
                Annotations table or Selections table with a 'label' column.
            filename: str or list(str)
                Filename(s). Names not present in the table index are ignored.
            label: int or list(int)
                Label(s)
            start: float
                Earliest end time in seconds. Only rows ending after this time are kept.
            end: float
                Latest start time in seconds. Only rows starting before this time are kept.

        Returns:
            df: pandas DataFrame
            Selection table
    """
    result = table

    # restrict to the requested files
    if filename is not None:
        names = [filename] if isinstance(filename, str) else filename
        present = [name for name in names if name in result.index]

        # none of the requested files are in the table: return an empty table
        if not present:
            return result.iloc[0:0]

        result = result.loc[present]

    # restrict to the requested labels
    if label is not None:
        wanted = label if isinstance(label, list) else [label]
        result = result[result.label.isin(wanted)]

    # keep only rows that end after `start` ...
    if start is not None:
        result = result[result.end > start]

    # ... and begin before `end`
    if end is not None:
        result = result[result.start < end]

    return result



[docs]
def query_annotated(selections, annotations, filename=None, label=None, start=None, end=None):
    """ Filter a selection table and its accompanying annotation table by 
        audio file, label and/or time window.

        The label filter is applied to the annotation table only. The 
        selections returned are those that own at least one of the 
        annotations passing all filters.

        Args:
            selections: pandas DataFrame
                Selections table.
            annotations: pandas DataFrame
                Annotations table.
            filename: str or list(str)
                Filename(s)
            label: int or list(int)
                Label(s)
            start: float
                Earliest end time in seconds
            end: float
                Latest start time in seconds

        Returns:
            df1,df2: tuple(pandas DataFrame, pandas DataFrame)
                Selection table and annotation table
    """
    sel = query_labeled(selections, filename=filename, start=start, end=end)
    annot = query_labeled(annotations, filename=filename, label=label, start=start, end=end)

    # the annotation index with its last level stripped identifies the parent selection
    parents = list({key[:-1] for key in annot.index.tolist()})
    sel = sel.loc[parents].sort_index()

    return sel, annot



[docs]
def aggregate_duration(table, label=None):
    """ Compute the aggregate duration of the annotations.

        Overlapping segments within the same file are only counted once.

        The input table is not modified.

        Args:
            table: pandas DataFrame
                Annotations table or Selections table. The first index level 
                is taken to be the filename.
            label: int or list(int)
                Label(s). Optional

        Returns:
            agg_dur: float
                Aggregate duration in seconds. Zero if the table is empty.
    """
    df = query_labeled(table=table, label=label)

    if len(df) == 0:
        return 0

    # Collect the intervals in a fresh table (query_labeled may hand back the caller's 
    # own object, so it must not be sorted in place) and order them by start time 
    # within each file. The merging below relies on this ordering; ordering by 
    # annotation id is not sufficient, since ids need not be chronological.
    intervals = pd.DataFrame({
        'filename': df.index.get_level_values(0).to_numpy(),
        'start': df['start'].to_numpy(),
        'end': df['end'].to_numpy()})
    intervals = intervals.sort_values(by=['filename', 'start', 'end'])

    agg_dur = 0
    first = True
    filename_prev = None
    end_prev = None

    for filename, start, end in zip(intervals['filename'], intervals['start'], intervals['end']):
        if first or filename != filename_prev or start >= end_prev:
            # new file, or no overlap with what has been counted so far
            agg_dur += end - start
            end_prev = end

        elif end > end_prev:
            # partial overlap: only count the part extending beyond the covered range
            agg_dur += end - end_prev
            end_prev = end

        first = False
        filename_prev = filename

    return agg_dur