Source code for ava.data.data_container

"""
DataContainer class for linking directories containing different sorts of data.

This is meant to make plotting and analysis easier.

TO DO
-----
- request random subsets.
- make sure input directories are iterable
- add features to existing files.
"""
__date__ = "July 2019 - November 2020"


import h5py
try: # Numba >= 0.52
	from numba.core.errors import NumbaPerformanceWarning
except ModuleNotFoundError:
	try: # Numba <= 0.45
		from numba.errors import NumbaPerformanceWarning
	except (NameError, ModuleNotFoundError):
		pass
import numpy as np
import os
from scipy.io import wavfile
from scipy.io.wavfile import WavFileWarning
from sklearn.decomposition import PCA
from time import strptime, mktime
import torch
import umap
import warnings

from ava.models.vae import VAE
from ava.models.vae_dataset import get_syllable_partition, \
		get_syllable_data_loaders
from ava.models.utils import get_hdf5s_from_dir


AUDIO_FIELDS = ['audio']
FILENAME_FIELDS = ['sap_time']
SEGMENT_FIELDS = ['segments', 'segment_audio']
PROJECTION_FIELDS = ['latent_means', 'latent_mean_pca', 'latent_mean_umap']
SPEC_FIELDS = ['specs', 'onsets', 'offsets', 'audio_filenames']
MUPET_FIELDS = ['syllable_number', 'syllable_start_time', 'syllable_end_time',
	'inter-syllable_interval', 'syllable_duration', 'starting_frequency',
	'final_frequency', 'minimum_frequency', 'maximum_frequency',
	'mean_frequency', 'frequency_bandwidth', 'total_syllable_energy',
	'peak_syllable_amplitude', 'cluster']
DEEPSQUEAK_FIELDS = ['id', 'label', 'accepted', 'score', 'begin_time',
	'end_time', 'call_length', 'principal_frequency', 'low_freq', 'high_freq',
	'delta_freq', 'frequency_standard_deviation', 'slope', 'sinuosity',
	'mean_power', 'tonality']
SAP_FIELDS = ['syllable_duration_sap', 'syllable_start', 'mean_amplitude',
	'mean_pitch', 'mean_FM', 'mean_AM2', 'mean_entropy', 'mean_pitch_goodness',
	'mean_mean_freq', 'pitch_variance', 'FM_variance', 'entropy_variance',
	'pitch_goodness_variance', 'mean_freq_variance', 'AM_variance']
ALL_FIELDS = AUDIO_FIELDS + FILENAME_FIELDS + SEGMENT_FIELDS + \
	PROJECTION_FIELDS + SPEC_FIELDS + MUPET_FIELDS + DEEPSQUEAK_FIELDS + \
	SAP_FIELDS
"""All fields that can be requested by a DataContainer object."""

MUPET_ONSET_COL = MUPET_FIELDS.index('syllable_start_time')
DEEPSQUEAK_ONSET_COL = DEEPSQUEAK_FIELDS.index('begin_time')
SAP_ONSET_COL = SAP_FIELDS.index('syllable_start')
PRETTY_NAMES = {
	'audio': 'Audio',
	'segments': 'Segments',
	'segment_audio': 'Segment Audio',
	'latent_means': 'Latent Means',
	'latent_mean_pca': 'Latent Mean PCA Projection',
	'latent_mean_umap': 'Latent Mean UMAP Projection',
	'specs': 'Spectrograms',
	'onsets': 'Onsets (s)',
	'offsets': 'Offsets (s)',
	'aduio_filenames': 'Filenames',
	'syllable_number': 'Syllable Number',
	'syllable_start_time': 'Onsets (s)',
	'syllable_duration': 'Duration (ms)',
	'starting_frequency': 'Starting Freq. (kHz)',
	'final_frequency': 'Final Freq. (kHz)',
	'minimum_frequency': 'Min Freq. (kHz)',
	'maximum_frequency': 'Max Freq. (kHz)',
	'mean_frequency': 'Mean Freq. (kHz)',
	'frequency_bandwidth': 'Freq. Bandwidth (kHz)',
	'total_syllable_energy': 'Total Energy (dB)',
	'peak_syllable_amplitude': 'Peak Amplitude (dB)',
	'cluster': 'Cluster',
	'id': 'Syllabler Number',
	'label': 'Label',
	'accepted': 'Accepted',
	'score': 'DeepSqueak Detection Score',
	'begin_time': 'Onsets (s)',
	'end_time': 'Offsets (s)',
	'call_length': 'Duration (ms)',
	'principal_frequency': 'Principal Freq. (kHz)',
	'low_freq': 'Minimum Freq. (kHz)',
	'high_freq': 'Max Freq. (kHz)',
	'delta_freq': 'Freq. Bandwidth (kHz)',
	'frequency_standard_deviation': 'Freq Std. Dev. (kHz)',
	'slope': 'Freq. Mod. (kHz/s)',
	'sinuosity': 'Sinuosity',
	'mean_power': 'Power (dB/Hz)',
	'tonality': 'Tonality',
	'syllable_duration_sap': 'Duration (s)',
	'syllable_start': 'Onset (s)',
	'mean_amplitude': 'Amplitude',
	'mean_pitch': 'Pitch',
	'mean_FM': 'Freq. Mod.',
	'mean_AM2': 'Amp. Mod.',
	'mean_entropy': 'Entropy',
	'mean_pitch_goodness': 'Goodness of Pitch',
	'mean_mean_freq': 'Mean Frequency',
	'pitch_variance': 'Pitch Variance',
	'FM_variance': 'Freq. Mod. Var.',
	'entropy_variance': 'Entropy Var.',
	'pitch_goodness_variance': 'Goodness of Pitch Var.',
	'mean_freq_variance': 'Freq. Var.',
	'AM_variance': 'Amp. Mod. Var.',
}
PRETTY_NAMES_NO_UNITS = {}
for k in PRETTY_NAMES:
	PRETTY_NAMES_NO_UNITS[k] = ' '.join(PRETTY_NAMES[k].split('(')[0].split(' '))



[docs]class DataContainer(): """ Link directories containing different data sources for easy plotting. The idea here is for plotting and analysis tools to accept a DataContainer, from which they can request different types of data. Those requests can then be handled here in a central location, which can cut down on redundant code and processing steps. Attributes ---------- audio_dirs : {list of str, None}, optional Directories containing audio. Defaults to None. segment_dirs : {list of str, None}, optional Directories containing segmenting decisions. spec_dirs : list of {str, None}, optional Directories containing hdf5 files of spectrograms. These should be files output by ava.preprocessing.preprocessing. Defaults to None. model_filename : {str, None}, optional The VAE checkpoint to load. Written by models.vae.save_state. Defaults to None. projection_dirs : list of {str, None}, optional Directory containing different projections. This is where things like latent means, their projections, and handcrafted features found in feature_dirs are saved. Defaults to None. plots_dir : str, optional Directory to save plots. Defaults to '' (current working directory). feature_dirs : list of {str, None}, optional Directory containing text files with different syllable features. For exmaple, this could contain exported MUPET, DeepSqueak or SAP syllable tables. Defaults to None. template_dir : {str, None}, optional Directory continaing audio files of song templates. Defaults to None. Methods ------- request(field) Request some type of data. Notes ----- Supported directory structure: :: ├── animal_1 │ ├── audio (raw audio) │ │ ├── foo.wav │ │ ├── bar.wav │ │ └── baz.wav │ ├── features (output of MUPET, DeepSqueak, SAP, ...) │ │ ├── foo.csv │ │ ├── bar.csv │ │ └── baz.csv │ ├── spectrograms (used to train models, written by │ │ ├── syllables_000.hdf5 preprocessing.preprocess.process_sylls) │ │ └── syllables_001.hdf5 │ └── projections (latent means, UMAP, PCA, tSNE │ ├── syllables_000.hdf5 projections, copies of features in │ └── syllables_001.hdf5 experiment_1/features. These are │ written by a DataContainer object.) ├── animal_2 │ ├── audio │ │ ├── 1.wav │ │ └── 2.wav │ ├── features │ │ ├── 1.csv │ │ └── 2.csv │ ├── spectrograms │ │ ├── syllables_000.hdf5 │ │ └── syllables_001.hdf5 │ └── projections │ ├── syllables_000.hdf5 │ └── syllables_001.hdf5 . . . There should be a 1-to-1 correspondence between, for example, the syllables in `animal_1/audio/baz.wav` and the features described in `animal_1/features/baz.csv`. Analogously, the fifth entry in `animal_2/spectrograms/syllables_000.hdf5` should describe the same syllable as the fifth entry in `animal_2/projections/syllables_000.hdf5`. There is no strict relationship, however, between individual files in `animal_1/audio` and `animal_1/spectrograms`. The hdf5 files in the spectrograms and projections directories should contain a subset of the syllables in the audio and features directories. Then a DataContainer object can be initialized as: >>> from ava.data.data_container import DataContainer >>> audio_dirs = ['animal_1/audio', 'animal_2/audio'] >>> spec_dirs = ['animal_1/spectrograms', 'animal_2/spectrograms'] >>> model_filename = 'checkpoint.tar' >>> dc = DataContainer(audio_dirs=audio_dirs, spec_dirs=spec_dirs, \ model_filename=model_filename) >>> latent_means = dc.request('latent_means') It's fine to leave some of the initialization parameters unspecified. If the DataContainer object is asked to do something it can't, it will hopefully complain politely. Or at least informatively. """ def __init__(self, audio_dirs=None, segment_dirs=None, spec_dirs=None, \ feature_dirs=None, projection_dirs=None, plots_dir='', \ model_filename=None, template_dir=None, verbose=True): self.audio_dirs = audio_dirs self.segment_dirs = segment_dirs self.spec_dirs = spec_dirs self.feature_dirs = feature_dirs self.projection_dirs = projection_dirs self.plots_dir = plots_dir self.model_filename = model_filename self.template_dir = template_dir self.verbose = verbose self.sylls_per_file = None # syllables in each hdf5 file in spec_dirs self.fields = self._check_for_fields() if self.plots_dir not in [None, ''] and \ not os.path.exists(self.plots_dir): os.makedirs(self.plots_dir)
[docs] def request(self, field): """ Request some type of data. Parameters ---------- field : str The type of data being requested. Should come from ... Raises ------ `NotImplementedError` when `field` is not recognized. Note ---- Besides `__init__` and `clear_projections`, this should be the only external-facing method. """ if field not in ALL_FIELDS: print(str(field) + " is not a valid field!") raise NotImplementedError # If it's not here, make it and return it. if field not in self.fields: if self.verbose: print("Making field:", field) data = self._make_field(field) # Otherwise, read it and return it. else: if self.verbose: print("Reading field:", field) data = self._read_field(field) if self.verbose: print("\tDone with:", field) return data
[docs] def clear_projections(self): """ Remove all projections. This deletes all the ``.hdf5`` files in ``self.projection_dirs``. """ for proj_dir in self.projection_dirs: if not os.path.exists(proj_dir): continue fns = [os.path.join(proj_dir, i) for i in os.listdir(proj_dir)] fns = [i for i in fns if len(i) > 5 and i[-5:] == '.hdf5'] for fn in fns: os.remove(fn) self.fields = self._check_for_fields()
def _make_field(self, field): """Make a field.""" if field == 'latent_means': data = self._make_latent_means() elif field == 'latent_mean_pca': data = self._make_latent_mean_pca_projection() elif field == 'latent_mean_umap': data = self._make_latent_mean_umap_projection() elif field in MUPET_FIELDS: data = self._make_feature_field(field, kind='mupet') elif field in DEEPSQUEAK_FIELDS: data = self._make_feature_field(field, kind='deepsqueak') elif field in SAP_FIELDS: data = self._make_feature_field(field, kind='sap') elif field in FILENAME_FIELDS: data = self._read_filename_field(field) elif field == 'specs': raise NotImplementedError else: raise NotImplementedError # Add this field to the collection of fields that have been computed. self.fields[field] = 1 if self.verbose: print("Making field:", field) return data def _read_field(self, field): """ Read a field from memory. Parameters ---------- field : str Field name to read from file. See ``ALL_FIELDS`` for possible fields. """ if field in AUDIO_FIELDS: raise NotImplementedError elif field == 'segments': return self._read_segments() elif field == 'segment_audio': return self._read_segment_audio() elif field in PROJECTION_FIELDS: load_dirs = self.projection_dirs elif field in SPEC_FIELDS: load_dirs = self.spec_dirs elif field in MUPET_FIELDS: load_dirs = self.projection_dirs elif field in DEEPSQUEAK_FIELDS: load_dirs = self.projection_dirs elif field in SAP_FIELDS: load_dirs = self.projection_dirs else: raise Exception("Can\'t read field: "+field+"\n This should have \ been caught in self.request!") to_return = [] for i in range(len(self.spec_dirs)): spec_dir, load_dir = self.spec_dirs[i], load_dirs[i] hdf5s = get_hdf5s_from_dir(spec_dir) for j, hdf5 in enumerate(hdf5s): filename = os.path.join(load_dir, os.path.split(hdf5)[-1]) with h5py.File(filename, 'r') as f: assert field in f, "Can\'t find field \'"+field+"\' in"+ \ " file \'"+filename+"\'!" if field == 'audio_filenames': data = np.array([k.decode('UTF-8') for k in f[field]]) to_return.append(data) else: to_return.append(np.array(f[field])) return np.concatenate(to_return) def _read_segment_audio(self): """ Read all the segmented audio and return it. result[audio_dir][audio_filename] = [audio_1, audio_2, ..., audio_n] """ self._check_for_dirs(['audio_dirs'], 'audio') segments = self.request('segments') result = {} for audio_dir in self.audio_dirs: dir_result = {} audio_fns = [i for i in os.listdir(audio_dir) if _is_wav_file(i) \ and i in segments[audio_dir]] for audio_fn in audio_fns: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=WavFileWarning) fs, audio = wavfile.read(os.path.join(audio_dir, audio_fn)) fn_result = [] for seg in segments[audio_dir][audio_fn]: i1 = int(round(seg[0]*fs)) i2 = int(round(seg[1]*fs)) fn_result.append(audio[i1:i2]) dir_result[audio_fn] = fn_result result[audio_dir] = dir_result return result def _read_segments(self): """ Return all the segmenting decisions. Return a dictionary mapping audio directories to audio filenames to numpy arrays of shape [num_segments,2] containing onset and offset times. TO DO: add support for other delimiters, file extstensions, etc. Returns ------- segments : dict Maps audio directories to audio filenames to numpy arrays. """ self._check_for_dirs(['audio_dirs', 'segment_dirs'], 'segments') result = {} for audio_dir, seg_dir in zip(self.audio_dirs, self.segment_dirs): dir_result = {} seg_fns = [os.path.join(seg_dir, i) for i in os.listdir(seg_dir) \ if _is_seg_file(i)] audio_fns = [os.path.split(i)[1][:-4]+'.wav' for i in seg_fns] for audio_fn, seg_fn in zip(audio_fns, seg_fns): segs = _read_columns(seg_fn, delimiter='\t', unpack=False, \ skiprows=0) if len(segs) > 0: dir_result[audio_fn] = segs result[audio_dir] = dir_result return result def _make_latent_means(self): """ Write latent means for the syllables in self.spec_dirs. Returns ------- latent_means : numpy.ndarray Latent means of shape (max_num_syllables, z_dim) Note ---- * Duplicated code with ``_write_projection``? """ self._check_for_dirs(['projection_dirs', 'spec_dirs', 'model_filename'],\ 'latent_means') # First, see how many syllables are in each file. temp = get_hdf5s_from_dir(self.spec_dirs[0]) assert len(temp) > 0, "Found no specs in" + self.spec_dirs[0] hdf5_file = temp[0] with h5py.File(hdf5_file, 'r') as f: self.sylls_per_file = len(f['specs']) spf = self.sylls_per_file # Load the model, making sure to get z_dim correct. map_loc = 'cuda' if torch.cuda.is_available() else 'cpu' z_dim = torch.load(self.model_filename, map_location=map_loc)['z_dim'] model = VAE(z_dim=z_dim) model.load_state(self.model_filename) # For each directory... all_latent = [] for i in range(len(self.spec_dirs)): spec_dir, proj_dir = self.spec_dirs[i], self.projection_dirs[i] # Make the projection directory if it doesn't exist. if proj_dir != '' and not os.path.exists(proj_dir): os.makedirs(proj_dir) # Make a DataLoader for the syllables. partition = get_syllable_partition([spec_dir], 1, shuffle=False) try: loader = get_syllable_data_loaders(partition, \ shuffle=(False,False))['train'] # Get the latent means from the model. latent_means = model.get_latent(loader) all_latent.append(latent_means) # Write them to the corresponding projection directory. hdf5s = get_hdf5s_from_dir(spec_dir) assert len(latent_means) // len(hdf5s) == spf for j in range(len(hdf5s)): filename = os.path.join(proj_dir, os.path.split(hdf5s[j])[-1]) data = latent_means[j*spf:(j+1)*spf] with h5py.File(filename, 'a') as f: f.create_dataset('latent_means', data=data) except AssertionError: # No specs in this directory pass return np.concatenate(all_latent) def _read_filename_field(self, field): if field == 'sap_time': data = self._make_sap_time() else: raise NotImplementedError return data def _make_sap_time(self): """Return time in seconds, following SAP conventions.""" onsets = self.request('syllable_start') fns = self.request('audio_filenames') result = np.zeros(lemn(onsets)) for i, onset, fn in zip(range(len(onsets)), onsets, fns): # December 29, 1899, 7pm is the SAP anchor time. anchor = mktime(strptime("1899 12 29 19", "%Y %m %d %H")) temp = os.path.split(fn)[-1].split('_')[1].split('.') day = float(temp[0]) millisecond = float(temp[1]) time = anchor + 24*60*60*day + 1e-3*millisecond result[i] = time + onset return result def _make_latent_mean_umap_projection(self): """Project latent means to two dimensions with UMAP.""" # Get latent means. latent_means = self.request('latent_means') # UMAP them. transform = umap.UMAP(n_components=2, n_neighbors=20, min_dist=0.1, \ metric='euclidean', random_state=42) if self.verbose: print("Running UMAP... (n="+str(len(latent_means))+")") # https://github.com/lmcinnes/umap/issues/252 with warnings.catch_warnings(): try: warnings.filterwarnings("ignore", \ category=NumbaPerformanceWarning) except NameError: pass embedding = transform.fit_transform(latent_means) if self.verbose: print("\tDone.") # Write to files. self._write_projection("latent_mean_umap", embedding) return embedding def _make_latent_mean_pca_projection(self): """Project latent means to two dimensions with PCA.""" # Get latent means. latent_means = self.request('latent_means') # UMAP them. transform = PCA(n_components=2, copy=False, random_state=42) if self.verbose: print("Running PCA...") embedding = transform.fit_transform(latent_means) if self.verbose: print("\tDone.") # Write to files. self._write_projection("latent_mean_pca", embedding) return embedding def _make_feature_field(self, field, kind): """ Read a feature from a text file and put it in an hdf5 file. Read from self.feature_dirs and write to self.projection_dirs. This could be a bit tricky because we need to match up the syllables in the text file with the ones in the hdf5 file. Parameters ---------- field : str Name of data being requested. See ``ALL_FIELDS`` for a complete list. kind : str, 'mupet' or 'deepsqueak' Is this a MUPET or a DeepSqueak field? Returns ------- data : numpy.ndarray Requested data. """ self._check_for_dirs( \ ['spec_dirs', 'feature_dirs', 'projection_dirs'], field) # Find which column the field is stored in. if kind == 'mupet': file_fields = MUPET_FIELDS onset_col = MUPET_ONSET_COL elif kind == 'deepsqueak': file_fields = DEEPSQUEAK_FIELDS onset_col = DEEPSQUEAK_ONSET_COL elif kind == 'sap': file_fields = SAP_FIELDS onset_col = SAP_ONSET_COL else: assert NotImplementedError field_col = file_fields.index(field) to_return = [] # Run through each directory. for i in range(len(self.spec_dirs)): spec_dir = self.spec_dirs[i] feature_dir = self.feature_dirs[i] proj_dir = self.projection_dirs[i] hdf5s = get_hdf5s_from_dir(spec_dir) current_fn, k = None, None for hdf5 in hdf5s: # Get the filenames and onsets from self.spec_dirs. with h5py.File(hdf5, 'r') as f: audio_filenames = np.array(f['audio_filenames']) spec_onsets = np.array(f['onsets']) # if kind == 'sap': # SAP writes onsets in milliseconds. # spec_onsets /= 1e3 feature_arr = np.zeros(len(spec_onsets)) # Loop through each syllable. for j in range(len(spec_onsets)): audio_fn, spec_onset = audio_filenames[j], spec_onsets[j] audio_fn = audio_fn.decode('UTF-8') # Update the feature file, if needed. if audio_fn != current_fn: current_fn = audio_fn feature_fn = os.path.split(audio_fn)[-1][:-4] if kind == 'deepsqueak': # DeepSqueak appends '_Stats' feature_fn += '_Stats' # when exporting features. feature_fn += '.csv' feature_fn = os.path.join(feature_dir, feature_fn) # Read the onsets and features. feature_onsets, features = \ _read_columns(feature_fn, [onset_col, field_col]) if kind == 'sap': # SAP writes onsets in milliseconds. feature_onsets /= 1e3 k = 0 # Look for the corresponding onset in the feature file. while spec_onset > feature_onsets[k] + 0.01: k += 1 assert k < len(feature_onsets) if abs(spec_onset - feature_onsets[k]) > 0.01: print("Mismatch between spec_dirs and feature_dirs!") print("hdf5 file:", hdf5) print("\tindex:", j) print("audio filename:", audio_fn) print("feature filename:", feature_fn) print("Didn't find spec_onset", spec_onset) print("in feature onsets of min:", \ np.min(feature_onsets), "max:", \ np.max(feature_onsets)) print("field:", field) print("kind:", kind) quit() # And add it to the feature array. feature_arr[j] = features[k] # Write the fields to self.projection_dirs. write_fn = os.path.join(proj_dir, os.path.split(hdf5)[-1]) with h5py.File(write_fn, 'a') as f: f.create_dataset(field, data=feature_arr) to_return.append(feature_arr) self.fields[field] = 1 return np.concatenate(to_return) def _write_projection(self, key, data): """Write the given projection to self.projection_dirs.""" sylls_per_file = self.sylls_per_file # For each directory... k = 0 for i in range(len(self.projection_dirs)): spec_dir, proj_dir = self.spec_dirs[i], self.projection_dirs[i] hdf5s = get_hdf5s_from_dir(spec_dir) for j in range(len(hdf5s)): filename = os.path.join(proj_dir, os.path.split(hdf5s[j])[-1]) to_write = data[k:k+sylls_per_file] with h5py.File(filename, 'a') as f: f.create_dataset(key, data=to_write) k += sylls_per_file def _check_for_fields(self): """Check to see which fields are saved.""" fields = {} # If self.spec_dirs is registered, assume everything is there. if self.spec_dirs is not None: for field in SPEC_FIELDS: fields[field] = 1 # Same for self.audio_dirs. if self.audio_dirs is not None: fields['audio'] = 1 # Same for self.segment_dirs. if self.segment_dirs is not None: fields['segments'] = 1 fields['segment_audio'] = 1 # If self.projection_dirs is registered, see what we have. # If it's in one file, assume it's in all of them. if self.projection_dirs is not None: if os.path.exists(self.projection_dirs[0]): hdf5s = get_hdf5s_from_dir(self.projection_dirs[0]) if len(hdf5s) > 0: hdf5 = hdf5s[0] if os.path.exists(hdf5): with h5py.File(hdf5, 'r') as f: for key in f.keys(): if key in ALL_FIELDS: fields[key] = 1 self.sylls_per_file = len(f[key]) return fields def _check_for_dirs(self, dir_names, field): """Check that the given directories exist.""" for dir_name in dir_names: if dir_name == 'audio_dirs': temp = self.audio_dirs elif dir_name == 'segment_dirs': temp = self.segment_dirs elif dir_name == 'spec_dirs': temp = self.spec_dirs elif dir_name == 'feature_dirs': temp = self.feature_dirs elif dir_name == 'projection_dirs': temp = self.projection_dirs elif dir_name == 'model_filename': temp = self.model_filename else: raise NotImplementedError assert temp is not None, dir_name + " must be specified before " + \ field + " is made!"
def _read_columns(filename, columns=(0,1), delimiter=',', skiprows=1, \ unpack=True): """ A wrapper around numpy.loadtxt to handle empty files. TO DO: Add categorical variables. """ data = np.loadtxt(filename, delimiter=delimiter, usecols=columns, \ skiprows=skiprows).reshape(-1,len(columns)) if unpack: return tuple(data[:,i] for i in range(data.shape[1])) return data def _is_seg_file(filename): """Is this a segmenting file?""" return len(filename) > 4 and filename[-4:] == '.txt' def _is_wav_file(filename): """Is this a wav file?""" return len(filename) > 4 and filename[-4:] == '.wav' if __name__ == '__main__': pass ###