# Source code for ava.data.data_container

"""
DataContainer class for linking directories containing different sorts of data.

This is meant to make plotting and analysis easier.

TO DO
-----
- request random subsets.
- make sure input directories are iterable
- add features to existing files.
"""
__date__ = "July 2019 - November 2020"


import h5py
try: # Numba >= 0.52
	from numba.core.errors import NumbaPerformanceWarning
except ModuleNotFoundError:
	try: # Numba <= 0.45
		from numba.errors import NumbaPerformanceWarning
	except (NameError, ModuleNotFoundError):
		pass
import numpy as np
import os
from scipy.io import wavfile
from scipy.io.wavfile import WavFileWarning
from sklearn.decomposition import PCA
from time import strptime, mktime
import torch
import umap
import warnings

from ava.models.vae import VAE
from ava.models.vae_dataset import get_syllable_partition, \
		get_syllable_data_loaders
from ava.models.utils import get_hdf5s_from_dir


# Field names, grouped by the data source that provides them.
AUDIO_FIELDS = ['audio']
FILENAME_FIELDS = ['sap_time']
SEGMENT_FIELDS = ['segments', 'segment_audio']
PROJECTION_FIELDS = ['latent_means', 'latent_mean_pca', 'latent_mean_umap']
SPEC_FIELDS = ['specs', 'onsets', 'offsets', 'audio_filenames']

# For the three feature tables below, a name's position in the list is used
# as the column index when the exported text file is read.
MUPET_FIELDS = [
	'syllable_number',
	'syllable_start_time',
	'syllable_end_time',
	'inter-syllable_interval',
	'syllable_duration',
	'starting_frequency',
	'final_frequency',
	'minimum_frequency',
	'maximum_frequency',
	'mean_frequency',
	'frequency_bandwidth',
	'total_syllable_energy',
	'peak_syllable_amplitude',
	'cluster',
]
DEEPSQUEAK_FIELDS = [
	'id',
	'label',
	'accepted',
	'score',
	'begin_time',
	'end_time',
	'call_length',
	'principal_frequency',
	'low_freq',
	'high_freq',
	'delta_freq',
	'frequency_standard_deviation',
	'slope',
	'sinuosity',
	'mean_power',
	'tonality',
]
SAP_FIELDS = [
	'syllable_duration_sap',
	'syllable_start',
	'mean_amplitude',
	'mean_pitch',
	'mean_FM',
	'mean_AM2',
	'mean_entropy',
	'mean_pitch_goodness',
	'mean_mean_freq',
	'pitch_variance',
	'FM_variance',
	'entropy_variance',
	'pitch_goodness_variance',
	'mean_freq_variance',
	'AM_variance',
]

ALL_FIELDS = (
	AUDIO_FIELDS
	+ FILENAME_FIELDS
	+ SEGMENT_FIELDS
	+ PROJECTION_FIELDS
	+ SPEC_FIELDS
	+ MUPET_FIELDS
	+ DEEPSQUEAK_FIELDS
	+ SAP_FIELDS
)
"""All fields that can be requested by a DataContainer object."""

# Column holding the syllable onset in each kind of feature table.
MUPET_ONSET_COL = MUPET_FIELDS.index('syllable_start_time')
DEEPSQUEAK_ONSET_COL = DEEPSQUEAK_FIELDS.index('begin_time')
SAP_ONSET_COL = SAP_FIELDS.index('syllable_start')
# Human-readable axis/legend labels for requestable fields, with units.
PRETTY_NAMES = {
	'audio': 'Audio',
	'segments': 'Segments',
	'segment_audio': 'Segment Audio',
	'latent_means': 'Latent Means',
	'latent_mean_pca': 'Latent Mean PCA Projection',
	'latent_mean_umap': 'Latent Mean UMAP Projection',
	'specs': 'Spectrograms',
	'onsets': 'Onsets (s)',
	'offsets': 'Offsets (s)',
	# This key used to be misspelled 'aduio_filenames', which left the
	# 'audio_filenames' field without a pretty name.
	'audio_filenames': 'Filenames',
	'syllable_number': 'Syllable Number',
	'syllable_start_time': 'Onsets (s)',
	'syllable_duration': 'Duration (ms)',
	'starting_frequency': 'Starting Freq. (kHz)',
	'final_frequency': 'Final Freq. (kHz)',
	'minimum_frequency': 'Min Freq. (kHz)',
	'maximum_frequency': 'Max Freq. (kHz)',
	'mean_frequency': 'Mean Freq. (kHz)',
	'frequency_bandwidth': 'Freq. Bandwidth (kHz)',
	'total_syllable_energy': 'Total Energy (dB)',
	'peak_syllable_amplitude': 'Peak Amplitude (dB)',
	'cluster': 'Cluster',
	'id': 'Syllable Number',
	'label': 'Label',
	'accepted': 'Accepted',
	'score': 'DeepSqueak Detection Score',
	'begin_time': 'Onsets (s)',
	'end_time': 'Offsets (s)',
	'call_length': 'Duration (ms)',
	'principal_frequency': 'Principal Freq. (kHz)',
	'low_freq': 'Minimum Freq. (kHz)',
	'high_freq': 'Max Freq. (kHz)',
	'delta_freq': 'Freq. Bandwidth (kHz)',
	'frequency_standard_deviation': 'Freq Std. Dev. (kHz)',
	'slope': 'Freq. Mod. (kHz/s)',
	'sinuosity': 'Sinuosity',
	'mean_power': 'Power (dB/Hz)',
	'tonality': 'Tonality',
	'syllable_duration_sap': 'Duration (s)',
	'syllable_start': 'Onset (s)',
	'mean_amplitude': 'Amplitude',
	'mean_pitch': 'Pitch',
	'mean_FM': 'Freq. Mod.',
	'mean_AM2': 'Amp. Mod.',
	'mean_entropy': 'Entropy',
	'mean_pitch_goodness': 'Goodness of Pitch',
	'mean_mean_freq': 'Mean Frequency',
	'pitch_variance': 'Pitch Variance',
	'FM_variance': 'Freq. Mod. Var.',
	'entropy_variance': 'Entropy Var.',
	'pitch_goodness_variance': 'Goodness of Pitch Var.',
	'mean_freq_variance': 'Freq. Var.',
	'AM_variance': 'Amp. Mod. Var.',
}
# The same labels with the parenthesised unit dropped. The text before the
# first '(' is stripped so that 'Onsets (s)' becomes 'Onsets' rather than
# 'Onsets ' (the previous split/join round trip kept the trailing space).
PRETTY_NAMES_NO_UNITS = {
	name: label.split('(')[0].strip() for name, label in PRETTY_NAMES.items()
}



class DataContainer():
	"""
	Link directories containing different data sources for easy plotting.

	The idea here is for plotting and analysis tools to accept a DataContainer,
	from which they can request different types of data. Those requests can then
	be handled here in a central location, which can cut down on redundant code
	and processing steps.

	Attributes
	----------
	audio_dirs : {list of str, None}, optional
		Directories containing audio. Defaults to None.
	segment_dirs : {list of str, None}, optional
		Directories containing segmenting decisions.
	spec_dirs : list of {str, None}, optional
		Directories containing hdf5 files of spectrograms. These should be
		files output by ava.preprocessing.preprocessing. Defaults to None.
	model_filename : {str, None}, optional
		The VAE checkpoint to load. Written by models.vae.save_state.
		Defaults to None.
	projection_dirs : list of {str, None}, optional
		Directory containing different projections. This is where things
		like latent means, their projections, and handcrafted features
		found in feature_dirs are saved. Defaults to None.
	plots_dir : str, optional
		Directory to save plots. Defaults to '' (current working directory).
	feature_dirs : list of {str, None}, optional
		Directory containing text files with different syllable features.
		For example, this could contain exported MUPET, DeepSqueak or SAP
		syllable tables. Defaults to None.
	template_dir : {str, None}, optional
		Directory containing audio files of song templates. Defaults to
		None.

	Methods
	-------
	request(field)
		Request some type of data.

	Notes
	-----

	Supported directory structure:

	::

		├── animal_1
		│   ├── audio                     (raw audio)
		│   │   ├── foo.wav
		│   │   ├── bar.wav
		│   │   └── baz.wav
		│   ├── features                 (output of MUPET, DeepSqueak, SAP, ...)
		│   │   ├── foo.csv
		│   │   ├── bar.csv
		│   │   └── baz.csv
		│   ├── spectrograms             (used to train models, written by
		│   │   ├── syllables_000.hdf5   preprocessing.preprocess.process_sylls)
		│   │   └── syllables_001.hdf5
		│   └── projections              (latent means, UMAP, PCA, tSNE
		│      ├── syllables_000.hdf5    projections, copies of features in
		│      └── syllables_001.hdf5    experiment_1/features. These are
		│                                written by a DataContainer object.)
		├── animal_2
		│   ├── audio
		│   │   ├── 1.wav
		│   │   └── 2.wav
		│   ├── features
		│   │   ├── 1.csv
		│   │   └── 2.csv
		│   ├── spectrograms
		│   │   ├── syllables_000.hdf5
		│   │   └── syllables_001.hdf5
		│   └── projections
		│       ├── syllables_000.hdf5
		│       └── syllables_001.hdf5
		.
		.
		.


	There should be a 1-to-1 correspondence between, for example, the syllables
	in `animal_1/audio/baz.wav` and the features described in
	`animal_1/features/baz.csv`. Analogously, the fifth entry in
	`animal_2/spectrograms/syllables_000.hdf5` should describe the same syllable
	as the fifth entry in `animal_2/projections/syllables_000.hdf5`. There is no
	strict relationship, however, between individual files in `animal_1/audio`
	and `animal_1/spectrograms`. The hdf5 files in the spectrograms and
	projections directories should contain a subset of the syllables in the
	audio and features directories.

	Then a DataContainer object can be initialized as:

	>>> from ava.data.data_container import DataContainer
	>>> audio_dirs = ['animal_1/audio', 'animal_2/audio']
	>>> spec_dirs = ['animal_1/spectrograms', 'animal_2/spectrograms']
	>>> model_filename = 'checkpoint.tar'
	>>> dc = DataContainer(audio_dirs=audio_dirs, spec_dirs=spec_dirs, \
	model_filename=model_filename)
	>>> latent_means = dc.request('latent_means')

	It's fine to leave some of the initialization parameters unspecified. If the
	DataContainer object is asked to do something it can't, it will hopefully
	complain politely. Or at least informatively.
	"""

	def __init__(self, audio_dirs=None, segment_dirs=None, spec_dirs=None, \
		feature_dirs=None, projection_dirs=None, plots_dir='', \
		model_filename=None, template_dir=None, verbose=True):
		self.audio_dirs = audio_dirs
		self.segment_dirs = segment_dirs
		self.spec_dirs = spec_dirs
		self.feature_dirs = feature_dirs
		self.projection_dirs = projection_dirs
		self.plots_dir = plots_dir
		self.model_filename = model_filename
		self.template_dir = template_dir
		self.verbose = verbose
		self.sylls_per_file = None # syllables in each hdf5 file in spec_dirs
		self.fields = self._check_for_fields()
		if self.plots_dir not in [None, ''] and \
					not os.path.exists(self.plots_dir):
			os.makedirs(self.plots_dir)


[docs]	def request(self, field):
		"""
		Request some type of data.

		Parameters
		----------
		field : str
			The type of data being requested. Should come from ...

		Raises
		------
		`NotImplementedError`
			when `field` is not recognized.

		Note
		----
		Besides `__init__` and `clear_projections`, this should be the only
		external-facing method.
		"""
		if field not in ALL_FIELDS:
			print(str(field) + " is not a valid field!")
			raise NotImplementedError
		# If it's not here, make it and return it.
		if field not in self.fields:
			if self.verbose:
				print("Making field:", field)
			data = self._make_field(field)
		# Otherwise, read it and return it.
		else:
			if self.verbose:
				print("Reading field:", field)
			data = self._read_field(field)
		if self.verbose:
			print("\tDone with:", field)
		return data


[docs]	def clear_projections(self):
		"""
		Remove all projections.

		This deletes all the ``.hdf5`` files in ``self.projection_dirs``.
		"""
		for proj_dir in self.projection_dirs:
			if not os.path.exists(proj_dir):
				continue
			fns = [os.path.join(proj_dir, i) for i in os.listdir(proj_dir)]
			fns = [i for i in fns if len(i) > 5 and i[-5:] == '.hdf5']
			for fn in fns:
				os.remove(fn)
		self.fields = self._check_for_fields()


	def _make_field(self, field):
		"""Make a field."""
		if field == 'latent_means':
			data = self._make_latent_means()
		elif field == 'latent_mean_pca':
			data = self._make_latent_mean_pca_projection()
		elif field == 'latent_mean_umap':
			data = self._make_latent_mean_umap_projection()
		elif field in MUPET_FIELDS:
			data = self._make_feature_field(field, kind='mupet')
		elif field in DEEPSQUEAK_FIELDS:
			data = self._make_feature_field(field, kind='deepsqueak')
		elif field in SAP_FIELDS:
			data = self._make_feature_field(field, kind='sap')
		elif field in FILENAME_FIELDS:
			data = self._read_filename_field(field)
		elif field == 'specs':
			raise NotImplementedError
		else:
			raise NotImplementedError
		# Add this field to the collection of fields that have been computed.
		self.fields[field] = 1
		if self.verbose:
			print("Making field:", field)
		return data


	def _read_field(self, field):
		"""
		Read a field that has already been saved.

		Segment fields are delegated to ``_read_segments`` and
		``_read_segment_audio``, and filename fields to
		``_read_filename_field``. Every other field is read from hdf5 files:
		for each file found in ``self.spec_dirs``, the file with the same
		basename in the matching load directory is opened and its `field`
		dataset is collected.

		Parameters
		----------
		field : str
			Field name to read from file. See ``ALL_FIELDS`` for possible
			fields.

		Returns
		-------
		data : {numpy.ndarray, dict}
			Concatenation of the per-file datasets, or the result of the
			delegated reader.

		Raises
		------
		`NotImplementedError`
			when `field` is an audio field.
		"""
		if field in AUDIO_FIELDS:
			raise NotImplementedError("Reading audio fields is not implemented.")
		if field == 'segments':
			return self._read_segments()
		if field == 'segment_audio':
			return self._read_segment_audio()
		if field in FILENAME_FIELDS:
			# Filename fields are not stored anywhere; compute them again.
			return self._read_filename_field(field)
		if field in SPEC_FIELDS:
			load_dirs = self.spec_dirs
		elif field in PROJECTION_FIELDS or field in MUPET_FIELDS \
				or field in DEEPSQUEAK_FIELDS or field in SAP_FIELDS:
			# Projections and copied features all live in projection_dirs.
			load_dirs = self.projection_dirs
		else:
			raise Exception("Can't read field: " + field + \
				"\n This should have been caught in self.request!")
		to_return = []
		for i, spec_dir in enumerate(self.spec_dirs):
			load_dir = load_dirs[i]
			# The file names come from spec_dir even when reading from another
			# directory, which keeps the files in one-to-one correspondence.
			for hdf5 in get_hdf5s_from_dir(spec_dir):
				filename = os.path.join(load_dir, os.path.split(hdf5)[-1])
				with h5py.File(filename, 'r') as f:
					assert field in f, "Can\'t find field \'"+field+"\' in"+ \
						" file \'"+filename+"\'!"
					if field == 'audio_filenames':
						# Filenames are stored as bytes; hand back str.
						data = np.array([k.decode('UTF-8') for k in f[field]])
					else:
						data = np.array(f[field])
				to_return.append(data)
		return np.concatenate(to_return)


	def _read_segment_audio(self):
		"""
		Cut every segment out of its audio file and return the pieces.

		Only ``.wav`` files that have an entry in the segmenting decisions
		are read.

		Returns
		-------
		result : dict
			``result[audio_dir][audio_filename]`` is the list
			``[audio_1, audio_2, ..., audio_n]`` of sample arrays, one per
			segment.
		"""
		self._check_for_dirs(['audio_dirs'], 'audio')
		all_segments = self.request('segments')
		audio_by_dir = {}
		for audio_dir in self.audio_dirs:
			wav_names = [name for name in os.listdir(audio_dir) \
				if _is_wav_file(name) and name in all_segments[audio_dir]]
			audio_by_file = {}
			for wav_name in wav_names:
				wav_path = os.path.join(audio_dir, wav_name)
				with warnings.catch_warnings():
					warnings.filterwarnings("ignore", category=WavFileWarning)
					fs, samples = wavfile.read(wav_path)
				# Turn onset/offset times into sample indices and slice.
				audio_by_file[wav_name] = [
					samples[int(round(seg[0]*fs)):int(round(seg[1]*fs))] \
					for seg in all_segments[audio_dir][wav_name]
				]
			audio_by_dir[audio_dir] = audio_by_file
		return audio_by_dir


	def _read_segments(self):
		"""
		Collect the segmenting decisions for every audio directory.

		Each ``.txt`` file in a segment directory is read as tab-delimited
		columns and stored under the name of the matching ``.wav`` file.
		Files that contain no segments are left out.

		TO DO: add support for other delimiters, file extensions, etc.

		Returns
		-------
		segments : dict
			Maps audio directories to audio filenames to numpy arrays of
			shape [num_segments,2] containing onset and offset times.
		"""
		self._check_for_dirs(['audio_dirs', 'segment_dirs'], 'segments')
		segments = {}
		for audio_dir, seg_dir in zip(self.audio_dirs, self.segment_dirs):
			per_file = {}
			for seg_name in os.listdir(seg_dir):
				if not _is_seg_file(seg_name):
					continue
				onsets_offsets = _read_columns(os.path.join(seg_dir, seg_name), \
					delimiter='\t', unpack=False, skiprows=0)
				if len(onsets_offsets) == 0:
					continue
				# Swap the four-character extension for '.wav'.
				per_file[seg_name[:-4]+'.wav'] = onsets_offsets
			segments[audio_dir] = per_file
		return segments


	def _make_latent_means(self):
		"""
		Write latent means for the syllables in self.spec_dirs.

		The checkpoint ``self.model_filename`` is loaded into a VAE, the
		spectrograms of every directory in ``self.spec_dirs`` are passed
		through ``model.get_latent``, and the result is written as a
		``'latent_means'`` dataset to the file with the same basename in the
		matching projection directory.

		Returns
		-------
		latent_means : numpy.ndarray
			Latent means of shape (max_num_syllables, z_dim)

		Note
		----
		* Duplicated code with ``_write_projection``?
		* The number of syllables per file is taken from the first file of
		  the first spec directory and reused to slice every other file.
		"""
		self._check_for_dirs(['projection_dirs', 'spec_dirs', 'model_filename'],\
			'latent_means')
		# First, see how many syllables are in each file.
		temp = get_hdf5s_from_dir(self.spec_dirs[0])
		assert len(temp) > 0, "Found no specs in " + self.spec_dirs[0]
		hdf5_file = temp[0]
		with h5py.File(hdf5_file, 'r') as f:
			self.sylls_per_file = len(f['specs'])
		spf = self.sylls_per_file
		# Load the model, making sure to get z_dim correct.
		map_loc = 'cuda' if torch.cuda.is_available() else 'cpu'
		z_dim = torch.load(self.model_filename, map_location=map_loc)['z_dim']
		model = VAE(z_dim=z_dim)
		model.load_state(self.model_filename)
		# For each directory...
		all_latent = []
		for i in range(len(self.spec_dirs)):
			spec_dir, proj_dir = self.spec_dirs[i], self.projection_dirs[i]
			# Make the projection directory if it doesn't exist.
			if proj_dir != '' and not os.path.exists(proj_dir):
				os.makedirs(proj_dir)
			# Make a DataLoader for the syllables.
			partition = get_syllable_partition([spec_dir], 1, shuffle=False)
			try:
				loader = get_syllable_data_loaders(partition, \
					shuffle=(False,False))['train']
				# Get the latent means from the model.
				latent_means = model.get_latent(loader)
				all_latent.append(latent_means)
				# Write them to the corresponding projection directory.
				hdf5s = get_hdf5s_from_dir(spec_dir)
				assert len(latent_means) // len(hdf5s) == spf
				for j in range(len(hdf5s)):
					filename = os.path.join(proj_dir, os.path.split(hdf5s[j])[-1])
					data = latent_means[j*spf:(j+1)*spf]
					# Mode 'a' keeps any datasets already in the file.
					with h5py.File(filename, 'a') as f:
						f.create_dataset('latent_means', data=data)
			# NOTE(review): presumably meant for an assertion raised by the
			# data loader helpers on an empty directory, but it also swallows
			# the syllables-per-file check above -- confirm.
			except AssertionError: # No specs in this directory
				pass
		return np.concatenate(all_latent)


	def _read_filename_field(self, field):
		if field == 'sap_time':
			data = self._make_sap_time()
		else:
			raise NotImplementedError
		return data


	def _make_sap_time(self):
		"""Return time in seconds, following SAP conventions."""
		onsets = self.request('syllable_start')
		fns = self.request('audio_filenames')
		result = np.zeros(lemn(onsets))
		for i, onset, fn in zip(range(len(onsets)), onsets, fns):
			# December 29, 1899, 7pm is the SAP anchor time.
			anchor = mktime(strptime("1899 12 29 19", "%Y %m %d %H"))
			temp = os.path.split(fn)[-1].split('_')[1].split('.')
			day = float(temp[0])
			millisecond = float(temp[1])
			time = anchor + 24*60*60*day + 1e-3*millisecond
			result[i] = time + onset
		return result


	def _make_latent_mean_umap_projection(self):
		"""
		Embed the latent means in two dimensions with UMAP and save the result.

		Returns
		-------
		embedding : numpy.ndarray
			Output of ``fit_transform`` on the latent means.
		"""
		latent_means = self.request('latent_means')
		reducer = umap.UMAP(
			n_components=2,
			n_neighbors=20,
			min_dist=0.1,
			metric='euclidean',
			random_state=42,
		)
		if self.verbose:
			print("Running UMAP... (n=" + str(len(latent_means)) + ")")
		# Silence numba's performance warning while fitting; see
		# https://github.com/lmcinnes/umap/issues/252
		with warnings.catch_warnings():
			try:
				warnings.filterwarnings(
					"ignore",
					category=NumbaPerformanceWarning,
				)
			except NameError:
				# The warning class could not be imported; filter nothing.
				pass
			embedding = reducer.fit_transform(latent_means)
		if self.verbose:
			print("\tDone.")
		# Store the embedding next to the latent means.
		self._write_projection("latent_mean_umap", embedding)
		return embedding


	def _make_latent_mean_pca_projection(self):
		"""
		Embed the latent means in two dimensions with PCA and save the result.

		Returns
		-------
		embedding : numpy.ndarray
			Output of ``fit_transform`` on the latent means.
		"""
		latent_means = self.request('latent_means')
		# Keep the first two principal components.
		reducer = PCA(n_components=2, copy=False, random_state=42)
		if self.verbose:
			print("Running PCA...")
		embedding = reducer.fit_transform(latent_means)
		if self.verbose:
			print("\tDone.")
		# Store the embedding next to the latent means.
		self._write_projection("latent_mean_pca", embedding)
		return embedding


	def _make_feature_field(self, field, kind):
		"""
		Read a feature from a text file and put it in an hdf5 file.

		Read from self.feature_dirs and write to self.projection_dirs. This
		could be a bit tricky because we need to match up the syllables in the
		text file with the ones in the hdf5 file.

		Parameters
		----------
		field : str
			Name of data being requested. See ``ALL_FIELDS`` for a complete
			list.
		kind : str, 'mupet' or 'deepsqueak'
			Is this a MUPET or a DeepSqueak field?

		Returns
		-------
		data : numpy.ndarray
			Requested data.
		"""
		self._check_for_dirs( \
			['spec_dirs', 'feature_dirs', 'projection_dirs'], field)
		# Find which column the field is stored in.
		if kind == 'mupet':
			file_fields = MUPET_FIELDS
			onset_col = MUPET_ONSET_COL
		elif kind == 'deepsqueak':
			file_fields = DEEPSQUEAK_FIELDS
			onset_col = DEEPSQUEAK_ONSET_COL
		elif kind == 'sap':
			file_fields = SAP_FIELDS
			onset_col = SAP_ONSET_COL
		else:
			assert NotImplementedError
		field_col = file_fields.index(field)
		to_return = []
		# Run through each directory.
		for i in range(len(self.spec_dirs)):
			spec_dir = self.spec_dirs[i]
			feature_dir = self.feature_dirs[i]
			proj_dir = self.projection_dirs[i]
			hdf5s = get_hdf5s_from_dir(spec_dir)
			current_fn, k = None, None
			for hdf5 in hdf5s:
				# Get the filenames and onsets from self.spec_dirs.
				with h5py.File(hdf5, 'r') as f:
					audio_filenames = np.array(f['audio_filenames'])
					spec_onsets = np.array(f['onsets'])
					# if kind == 'sap': # SAP writes onsets in milliseconds.
					# 	spec_onsets /= 1e3
				feature_arr = np.zeros(len(spec_onsets))
				# Loop through each syllable.
				for j in range(len(spec_onsets)):
					audio_fn, spec_onset = audio_filenames[j], spec_onsets[j]
					audio_fn = audio_fn.decode('UTF-8')
					# Update the feature file, if needed.
					if audio_fn != current_fn:
						current_fn = audio_fn
						feature_fn = os.path.split(audio_fn)[-1][:-4]
						if kind == 'deepsqueak':   # DeepSqueak appends '_Stats'
							feature_fn += '_Stats' # when exporting features.
						feature_fn += '.csv'
						feature_fn = os.path.join(feature_dir, feature_fn)
						# Read the onsets and features.
						feature_onsets, features = \
							_read_columns(feature_fn, [onset_col, field_col])
						if kind == 'sap': # SAP writes onsets in milliseconds.
							feature_onsets /= 1e3
						k = 0
					# Look for the corresponding onset in the feature file.
					while spec_onset > feature_onsets[k] + 0.01:
						k += 1
						assert k < len(feature_onsets)
					if abs(spec_onset - feature_onsets[k]) > 0.01:
						print("Mismatch between spec_dirs and feature_dirs!")
						print("hdf5 file:", hdf5)
						print("\tindex:", j)
						print("audio filename:", audio_fn)
						print("feature filename:", feature_fn)
						print("Didn't find spec_onset", spec_onset)
						print("in feature onsets of min:", \
								np.min(feature_onsets), "max:", \
								np.max(feature_onsets))
						print("field:", field)
						print("kind:", kind)
						quit()
					# And add it to the feature array.
					feature_arr[j] = features[k]
				# Write the fields to self.projection_dirs.
				write_fn = os.path.join(proj_dir, os.path.split(hdf5)[-1])
				with h5py.File(write_fn, 'a') as f:
					f.create_dataset(field, data=feature_arr)
				to_return.append(feature_arr)
		self.fields[field] = 1
		return np.concatenate(to_return)


	def _write_projection(self, key, data):
		"""
		Write the given projection to self.projection_dirs.

		`data` is cut into consecutive chunks of ``self.sylls_per_file`` rows.
		Each chunk is stored as dataset `key` in the projection file that has
		the same basename as the corresponding file in ``self.spec_dirs``.
		"""
		chunk = self.sylls_per_file
		start = 0
		for i, proj_dir in enumerate(self.projection_dirs):
			spec_dir = self.spec_dirs[i]
			for spec_file in get_hdf5s_from_dir(spec_dir):
				target = os.path.join(proj_dir, os.path.basename(spec_file))
				rows = data[start:start+chunk]
				# Mode 'a' keeps any datasets already in the file.
				with h5py.File(target, 'a') as f:
					f.create_dataset(key, data=rows)
				start += chunk


	def _check_for_fields(self):
		"""Check to see which fields are saved."""
		fields = {}
		# If self.spec_dirs is registered, assume everything is there.
		if self.spec_dirs is not None:
			for field in SPEC_FIELDS:
				fields[field] = 1
		# Same for self.audio_dirs.
		if self.audio_dirs is not None:
			fields['audio'] = 1
		# Same for self.segment_dirs.
		if self.segment_dirs is not None:
			fields['segments'] = 1
			fields['segment_audio'] = 1
		# If self.projection_dirs is registered, see what we have.
		# If it's in one file, assume it's in all of them.
		if self.projection_dirs is not None:
			if os.path.exists(self.projection_dirs[0]):
				hdf5s = get_hdf5s_from_dir(self.projection_dirs[0])
				if len(hdf5s) > 0:
					hdf5 = hdf5s[0]
					if os.path.exists(hdf5):
						with h5py.File(hdf5, 'r') as f:
							for key in f.keys():
								if key in ALL_FIELDS:
									fields[key] = 1
									self.sylls_per_file = len(f[key])
		return fields


	def _check_for_dirs(self, dir_names, field):
		"""Check that the given directories exist."""
		for dir_name in dir_names:
			if dir_name == 'audio_dirs':
				temp = self.audio_dirs
			elif dir_name == 'segment_dirs':
				temp = self.segment_dirs
			elif dir_name == 'spec_dirs':
				temp = self.spec_dirs
			elif dir_name == 'feature_dirs':
				temp = self.feature_dirs
			elif dir_name == 'projection_dirs':
				temp = self.projection_dirs
			elif dir_name == 'model_filename':
				temp = self.model_filename
			else:
				raise NotImplementedError
			assert temp is not None, dir_name + " must be specified before " + \
				field + " is made!"



def _read_columns(filename, columns=(0,1), delimiter=',', skiprows=1,
	unpack=True):
	"""
	Load selected columns of a text file with numpy.loadtxt.

	The loaded values are always reshaped to two dimensions, one column per
	requested column, so empty and single-row files give the same layout as
	any other file.

	TO DO: Add categorical variables.

	Parameters
	----------
	filename : str
		File to read.
	columns : sequence of int, optional
		Columns to load. Defaults to ``(0,1)``.
	delimiter : str, optional
		Column separator. Defaults to ``','``.
	skiprows : int, optional
		Number of leading lines to skip. Defaults to 1.
	unpack : bool, optional
		If True, return a tuple with one 1-d array per column; otherwise
		return the 2-d array. Defaults to True.
	"""
	table = np.loadtxt(filename, delimiter=delimiter, usecols=columns,
		skiprows=skiprows)
	table = table.reshape(-1, len(columns))
	if not unpack:
		return table
	return tuple(table[:,col] for col in range(table.shape[1]))


def _is_seg_file(filename):
	"""Return True if `filename` has a non-empty stem and ends in '.txt'."""
	has_stem = len(filename) > 4
	return has_stem and filename.endswith('.txt')


def _is_wav_file(filename):
	"""Return True if `filename` has a non-empty stem and ends in '.wav'."""
	has_stem = len(filename) > 4
	return has_stem and filename.endswith('.wav')



# Running this module as a script does nothing; it is meant to be imported.
if __name__ == '__main__':
	pass


###