Source code for ketos.data_handling.data_handling

# ================================================================================ #
#   Authors: Fabio Frazao and Oliver Kirsebom                                      #
#   Contact: fsfrazao@dal.ca, oliver.kirsebom@dal.ca                               #
#   Organization: MERIDIAN (https://meridian.cs.dal.ca/)                           #
#   Team: Data Analytics                                                           #
#   Project: ketos                                                                 #
#   Project goal: The ketos library provides functionalities for handling          #
#   and processing acoustic data and applying deep neural networks to sound        #
#   detection and classification tasks.                                            #
#                                                                                  #
#   License: GNU GPLv3                                                             #
#                                                                                  #
#       This program is free software: you can redistribute it and/or modify       #
#       it under the terms of the GNU General Public License as published by       #
#       the Free Software Foundation, either version 3 of the License, or          #
#       (at your option) any later version.                                        #
#                                                                                  #
#       This program is distributed in the hope that it will be useful,            #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#       GNU General Public License for more details.                               # 
#                                                                                  #
#       You should have received a copy of the GNU General Public License          #
#       along with this program.  If not, see <https://www.gnu.org/licenses/>.     #
# ================================================================================ #

""" Data handling module within the ketos library

    This module provides utilities to load and handle data files.
"""
import numpy as np
import pandas as pd
import os
import math
import errno
from subprocess import call
import soundfile as sf
from ketos.utils import tostring
import datetime
import datetime_glob
import re


def rel_path_unix(path, start=None):
    """ Return the directory part of a relative path, written in unix style.

        The path is first made relative to ``start`` (or to the current
        working directory if ``start`` is None). The final path component
        (normally the file name) is dropped, and the remaining directories
        are joined with forward slashes, with a leading and a trailing
        slash.

        Paths are interpreted with the ``os.path`` rules of the platform
        on which the code is running.

        Args:
            path: str
                Path.
            start: str
                Optional start directory.

        Returns:
            u: str
                Relative unix-style directory path, e.g. '/projectX/'.
                If the path sits directly in the start directory,
                '/' is returned.

        Examples:
            >>> from ketos.data_handling.data_handling import rel_path_unix
            >>> path = "/home/me/documents/projectX/file1.pdf"
            >>> start = "/home/me/documents/"
            >>> u = rel_path_unix(path, start)
            >>> print(u)
            /projectX/
    """
    relative = os.path.relpath(path, start)

    # drop the last component, then peel off the directories from the right
    remaining = os.path.dirname(relative)
    components = []
    while remaining:
        remaining, tail = os.path.split(remaining)
        components.append(tail)

    # components were collected innermost-first; emit them outermost-first
    return '/' + ''.join(name + '/' for name in reversed(components))
def parse_datetime(to_parse, fmt=None, replace_spaces='0'):
    """ Parse date-time data from string.

        Returns None if no format is given or if the string does not
        match the format.

        If the year is encoded with only two figures (``%y``) and the
        parsed date lies in the future, the date is shifted 100 years
        back. For example, 45 would be parsed to 1945 (assuming that the
        program is being executed in a year earlier than 2045).

        Args:
            to_parse: str
                String with date-time data to parse.
            fmt: str
                String defining the date-time format.
                Example: %d_%m_%Y* would capture "14_3_1999.txt"
                See https://pypi.org/project/datetime-glob/ for a list of valid directives.
                In addition to the directives allowed by the datetime-glob package, it is
                also possible to specify %S.%ms for milliseconds. Note that the milliseconds (%ms)
                must follow the seconds (%S) separated by a period (.) or underscore (_) and can
                only be followed by an asterisk (*) or nothing.
                If None (the default), nothing is parsed and None is returned.
            replace_spaces: str
                If string contains spaces, replaces them with this string

        Returns:
            datetime: datetime object
                The parsed date-time, or None if parsing was not possible.

        Raises:
            ValueError: If the %ms directive is used and the characters
                following the seconds cannot be converted to an integer.

        Examples:
            >>> #This will parse dates in the day/month/year format,
            >>> #separated by '/'. It will also ignore any text after the year,
            >>> # (such as a file extension )
            >>>
            >>> from ketos.data_handling.data_handling import parse_datetime
            >>> fmt = "%d/%m/%Y*"
            >>> result = parse_datetime("10/03/1942.txt", fmt)
            >>> result.year
            1942
            >>> result.month
            3
            >>> result.day
            10
            >>>
            >>> # Now with the time (hour:minute:second) separated from the date by an underscore
            >>> fmt = "%H:%M:%S_%d/%m/%Y*"
            >>> result = parse_datetime("15:43:03_10/03/1918.wav", fmt)
            >>> result.year
            1918
            >>> result.month
            3
            >>> result.day
            10
            >>> result.hour
            15
            >>> result.minute
            43
            >>> result.second
            3
    """
    # Without a format there is nothing to match against. This guard must
    # come first: the millisecond test below does `... in fmt`, which
    # raises TypeError when fmt is None.
    if fmt is None:
        return None

    # The %ms directive is handled here rather than by datetime_glob:
    # replace it with a wildcard and extract the milliseconds manually below.
    millisecond = False
    for sep in [".", "_"]:
        if f'%S{sep}%ms' in fmt:
            millisecond = True
            fmt = fmt.replace(f'%S{sep}%ms', '%S*')

    # replace spaces
    to_parse = to_parse.replace(' ', replace_spaces)

    matcher = datetime_glob.Matcher(pattern=fmt)
    match = matcher.match(path=to_parse)
    if match is None:
        return None

    dt = match.as_datetime()

    # two-digit years that land in the future belong to the previous century
    if dt > datetime.datetime.now() and "%y" in fmt:
        dt = dt.replace(year=dt.year - 100)

    if millisecond:
        # Re-render the matched date-time to locate where it ends in the
        # input; the (up to) three characters after the separator are
        # taken as the milliseconds.
        dt_str = dt.strftime(fmt).replace('*', '')
        i = to_parse.rfind(dt_str) + len(dt_str) + 1
        ms = int(to_parse[i:i + 3])
        dt += datetime.timedelta(microseconds=1e3 * ms)

    return dt
def find_files(path, substr, return_path=True, search_subdirs=False, search_path=False):
    """ Find all files in the specified directory containing the specified substring in their file name

        Directories are never returned, only the files they contain.

        Args:
            path: str
                Directory path
            substr: str or list (str)
                Substring contained in file name. If several substrings are
                given, a file is selected if it contains at least one of them.
            return_path: bool
                If True, path to each file, relative to the top directory.
                If false, only return the filenames
            search_subdirs: bool
                If True, search all subdirectories
            search_path: bool
                Search for substring occurrence in relative path rather than just the filename.
                Only has an effect when relative paths are being collected, i.e.,
                when both return_path and search_subdirs are True.

        Returns:
            files: list (str)
                Alphabetically sorted list of file names

        Examples:
            >>> from ketos.data_handling.data_handling import find_files
            >>>
            >>> # Find files that contain 'super' in the name;
            >>> # Do not return the relative path
            >>> find_files(path="ketos/tests/assets", substr="super", return_path=False)
            ['super_short_1.wav', 'super_short_2.wav']
            >>>
            >>> # Find files that contain 'super' in the name, also in subdirectories
            >>> # Return the relative path
            >>> find_files(path="ketos/tests/", substr="super", search_subdirs=True)
            ['assets/super_short_1.wav', 'assets/super_short_2.wav']
    """
    if isinstance(substr, str):
        substr = [substr]

    # collect the candidate files
    if search_subdirs:
        candidates = []
        for dirpath, _, filenames in os.walk(path):
            if return_path:
                candidates += [os.path.relpath(os.path.join(dirpath, f), path) for f in filenames]
            else:
                candidates += filenames
    else:
        # os.listdir also lists sub-directories; exclude them so that both
        # search modes agree on what counts as a file (os.walk reports every
        # non-directory entry as a file).
        candidates = [f for f in os.listdir(path)
                      if not os.path.isdir(os.path.join(path, f))]

    # select those that contain at least one of the specified substrings
    selected = []
    for f in candidates:
        target = f if search_path else os.path.basename(f)
        if any(ss in target for ss in substr):
            selected.append(f)

    # sort alphabetically
    selected.sort()
    return selected
def find_wave_files(path, return_path=True, search_subdirs=False, search_path=False):
    """ Find all wave files in the specified directory

        A file is selected if '.wav' or '.WAV' occurs in its name
        (the search itself is delegated to find_files).

        Args:
            path: str
                Directory path
            return_path: bool
                If True, path to each file, relative to the top directory.
                If false, only return the filenames
            search_subdirs: bool
                If True, search all subdirectories
            search_path: bool
                Search for substring occurrence in relative path rather than just the filename

        Returns:
            : list (str)
                Alphabetically sorted list of file names

        Examples:
            >>> from ketos.data_handling.data_handling import find_wave_files
            >>>
            >>> find_wave_files(path="ketos/tests/assets", return_path=False)
            ['2min.wav', 'empty.wav', 'grunt1.wav', 'super_short_1.wav', 'super_short_2.wav']
    """
    wave_extensions = ['.wav', '.WAV']
    return find_files(
        path,
        substr=wave_extensions,
        return_path=return_path,
        search_subdirs=search_subdirs,
        search_path=search_path,
    )
def read_wave(file, channel=0, start=0, stop=None):
    """ Read one channel of a wave file.

        Wrapper method around

            https://pysoundfile.readthedocs.io/en/latest/index.html#soundfile.read

        The file is always read as a 2d (frames x channels) array, so that
        mono and multi-channel files can be handled in the same way; the
        requested channel is then extracted.

        Args:
            file: str
                path to the wave file
            channel: int
                Which channel should be used in case of stereo data (0: left, 1: right)
            start: int (optional)
                Where to start reading. A negative value counts from the end.
                Defaults to 0.
            stop: int (optional)
                The index after the last time step to be read. A negative value counts from the end.

        Returns:
            (rate,data)
                rate: int
                    The sampling rate
                data: numpy.array (float)
                    A 1d array containing the audio data

        Examples:
            >>> from ketos.data_handling.data_handling import read_wave
            >>> rate, data = read_wave("ketos/tests/assets/2min.wav")
            >>> # the function returns the sampling rate (in Hz) as an integer
            >>> type(rate)
            <class 'int'>
            >>> rate
            2000
            >>> # And the actual audio data is a numpy array
            >>> type(data)
            <class 'numpy.ndarray'>
            >>> len(data)
            241664
            >>> # Since each item in the vector is one sample,
            >>> # The duration of the audio in seconds can be obtained by
            >>> # dividing the the vector length by the sampling rate
            >>> len(data)/rate
            120.832
    """
    frames, rate = sf.read(file=file, start=start, stop=stop, always_2d=True)

    # a column slice of a 2d array is not contiguous; asfortranarray
    # returns a contiguous copy of the selected channel
    return rate, np.asfortranarray(frames[:, channel])
def create_dir(dir):
    """ Make sure that a directory exists, creating it if necessary.

        Missing intermediate directories are created as well. Nothing
        happens if the directory already exists.

        Args:
            dir: str
                The path to the new directory
    """
    os.makedirs(name=dir, exist_ok=True)
def to1hot(value, depth):
    """Converts integer label(s) to one hot format

        Args:
            value: scalar or numpy.array | int or float
                The label(s) to be converted. Values are cast to
                64-bit integers before being used.
            depth: int
                The number of possible values for the labels
                (number of categories).

        Returns:
            one_hot: numpy array (dtype=float64)
                A len(value) by depth array containing the one hot encoding
                for the given value(s). For a scalar value, a 1d array of
                length depth.

        Example:
            >>> from ketos.data_handling.data_handling import to1hot
            >>>
            >>> # An example with two possible labels (0 or 1)
            >>> values = np.array([0,1])
            >>> to1hot(values,depth=2)
            array([[1., 0.],
                   [0., 1.]])
            >>>
            >>> # The same example with 4 possible labels (0,1,2 or 3)
            >>> values = np.array([0,1])
            >>> to1hot(values,depth=4)
            array([[1., 0., 0., 0.],
                   [0., 1., 0., 0.]])
    """
    # row k of the identity matrix is the one hot encoding of label k
    identity = np.eye(depth)
    return identity[np.int64(value)]
def from1hot(value):
    """Converts one hot encoded label(s) to integer category label(s)

        Args:
            value: numpy.array or array-like | int or float
                The one hot encoded label(s) to be converted. Either a
                single encoding (1 dimension) or an n by m array holding
                n encodings of depth m.

        Returns:
            output: int or numpy array (dtype=int64)
                An int representing the category if 'value' has 1 dimension or an
                array of n ints if value is an n by m array.

        Example:
            >>> from ketos.data_handling.data_handling import from1hot
            >>>
            >>> from1hot(np.array([0,0,0,1,0]))
            3
            >>> from1hot(np.array([[0,0,0,1,0],
            ...    [0,1,0,0,0]]))
            array([3, 1])
    """
    # accept lists/tuples as well as numpy arrays
    value = np.asarray(value)

    if value.ndim > 1:
        # astype converts the values. Assigning to the .dtype attribute would
        # instead reinterpret the underlying bytes, which gives wrong results
        # where argmax's native integer type is not 64 bits wide.
        return np.argmax(value, axis=1).astype(np.int64)

    return np.argmax(value)
def check_data_sanity(images, labels):
    """ Check that all images have same size, all labels have values,
        and number of images and labels match.

        Args:
            images: numpy array or pandas series
                Images
            labels: numpy array or pandas series
                Labels

        Raises:
            ValueError:
                If no images or labels are passed;
                If the number of images and labels is different;
                If images have different shapes;
                If any labels are NaN.

        Returns:
            True if all checks pass.

        Examples:
            >>> from ketos.data_handling.data_handling import check_data_sanity
            >>> # Load a database with images and integer labels
            >>> data = pd.read_pickle("ketos/tests/assets/pd_img_db.pickle")
            >>> images = data['image']
            >>> labels = data['label']
            >>> # When all the images and labels pass all the quality checks,
            >>> # The function returns True
            >>> check_data_sanity(images, labels)
            True
            >>> # If something is wrong, like if the number of labels
            >>> # is different from the number of images, an exception is raised
            >>> labels = data['label'][:10]
            >>> check_data_sanity(images, labels=labels)
            Traceback (most recent call last):
                ...
            ValueError: Image and label columns have different lengths
    """
    if images is None or labels is None:
        raise ValueError(" Images and labels cannot be None")

    # check that number of images matches numbers of labels
    if len(images) != len(labels):
        raise ValueError("Image and label columns have different lengths")

    if len(images) == 0:
        raise ValueError("No images or labels were passed")

    # Check that all images have the same size as the first one. The first
    # image is obtained by iteration rather than images[0], because
    # indexing a pandas series with 0 is label-based and fails when the
    # index does not contain 0 (e.g. for a slice of a larger series).
    shapes = (x.shape for x in images)
    reference_shape = next(shapes)
    if any(shape != reference_shape for shape in shapes):
        raise ValueError("Images do not all have the same size")

    # Check that all labels have values. pd.isna is used instead of
    # np.isnan so that non-numeric (object dtype) labels are supported too.
    if np.count_nonzero(pd.isna(labels)) != 0:
        raise ValueError("Some labels are NaN")

    return True