Source code for high5py.high5py

import numpy as _np
import h5py as _h5py


def info(filepath, name='/', return_info=False):
    """Print and return information about HDF5 file/group/dataset.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset). Defaults to root
        group ('/').
    return_info: bool, optional
        If True, return a dictionary of results. Defaults to False.

    Returns
    -------
    info: dict, optional
        Dictionary of key, value pairs describing specified
        file/group/dataset. Only provided if return_info is True.
    """
    name = '{}'.format(name)
    with _h5py.File(filepath, 'r') as fid:
        info_dict = {'filename': fid.filename, 'name': fid[name].name}
        if isinstance(fid[name], _h5py.Group):
            info_dict['groups'] = [
                subname for subname in fid[name]
                if isinstance(
                    fid['{}/{}'.format(name, subname)], _h5py.Group)]
            info_dict['datasets'] = [
                subname for subname in fid[name]
                if isinstance(
                    fid['{}/{}'.format(name, subname)], _h5py.Dataset)]
        if isinstance(fid[name], _h5py.Dataset):
            info_dict['datatype'] = fid[name].dtype
            info_dict['shape'] = fid[name].shape
            info_dict['size'] = fid[name].size
            info_dict['chunks'] = fid[name].chunks
            info_dict['compression'] = fid[name].compression
        info_dict['attributes'] = dict(fid[name].attrs)
        # Right-align keys to the longest key name for readable output
        key_width = max(len(key) for key in info_dict)
        for key, val in info_dict.items():
            print('{:>{}}: {}'.format(key, key_width, val))
    if return_info:
        return info_dict


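# Usage sketch (illustrative example, not part of the original module):
# summarize a file, then capture a group's summary as a dict. The file and
# group names are hypothetical, and the import assumes the package exposes
# these functions at the top level.
#
#     import high5py as h5
#     h5.info('example.h5')                    # print root-group summary
#     d = h5.info('example.h5', name='/group', return_info=True)
#     d['datasets']                            # dataset names in the group

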
def list_all(filepath, name='/', return_info=False):
    """List all groups and datasets in HDF5 file or group.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 group name (e.g., /group). Defaults to root group ('/').
    return_info: bool, optional
        If True, return a dictionary of results. Defaults to False.

    Returns
    -------
    info: dict, optional
        Dictionary of key, value pairs describing specified file/group.
        Only provided if return_info is True.
    """
    all_names = []
    all_items = {}
    with _h5py.File(filepath, 'r') as fid:
        fid[name].visit(all_names.append)
        max_len = max(len(subname) for subname in all_names)

        # Record and print each item, left-aligned to the longest name
        def print_item(item_name, obj):
            all_items[item_name] = str(obj)
            print('{:<{}} {}'.format(item_name, max_len, obj))

        fid[name].visititems(print_item)
    if return_info:
        return all_items


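# Usage sketch (illustrative; 'example.h5' is hypothetical): print the full
# tree of a file and keep the listing as a dict.
#
#     import high5py as h5
#     h5.list_all('example.h5')
#     items = h5.list_all('example.h5', return_info=True)

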
def exists(filepath, name):
    """Determine if group/dataset name exists in HDF5 file.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str
        HDF5 group/dataset name (e.g., /group/dataset).

    Returns
    -------
    exists: bool
        Boolean describing if path exists in HDF5 file.
    """
    avail_names = []
    with _h5py.File(filepath, 'r') as fid:
        fid.visit(avail_names.append)
    # visit() reports names without a leading slash, so strip any leading
    # or trailing slashes from the queried name before comparing
    return name.strip('/') in avail_names


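# Usage sketch (illustrative, hypothetical file/dataset names):
#
#     import high5py as h5
#     if h5.exists('example.h5', 'group/dataset'):
#         data = h5.load_dataset('example.h5', name='group/dataset')

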
def load_dataset(filepath, name='data', start_index=None, end_index=None):
    """Load dataset from HDF5 file.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset). Defaults to 'data'.
    start_index: int, optional
        Start index for slicing HDF5 dataset. Providing a slice index here
        may be more efficient than returning the entire dataset and then
        slicing. Defaults to None, for which no slicing will be done on
        the beginning of the dataset.
    end_index: int, optional
        End index for slicing HDF5 dataset. Providing a slice index here
        may be more efficient than returning the entire dataset and then
        slicing. Defaults to None, for which no slicing will be done on
        the end of the dataset.

    Returns
    -------
    data: array-like, scalar, or str
        Dataset values are returned with the same type they were saved
        (usually some sort of numpy array), except that scalar
        (zero-dimensional) datasets are returned as scalars.
    """
    with _h5py.File(filepath, 'r') as fid:
        if start_index is None and end_index is None:
            # Read everything; [()] also handles scalar datasets, which
            # do not support slicing
            data = fid[name][()]
        else:
            # Slicing with a None endpoint is equivalent to an open-ended
            # slice, so one expression covers all partial-slice cases
            data = fid[name][start_index:end_index]
    return data


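# Usage sketch (illustrative, hypothetical names): load a full dataset,
# then just its first ten elements.
#
#     import high5py as h5
#     data = h5.load_dataset('example.h5', name='/group/dataset')
#     head = h5.load_dataset('example.h5', name='/group/dataset',
#                            end_index=10)

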
def save_dataset(
        filepath, data, name='data', description=None, overwrite=True,
        compression_level=None):
    """Save dataset to HDF5 file (overwrites file by default).

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    data: array-like, scalar, or str
        Data to save.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset). Defaults to 'data'.
    description: str, optional
        String describing dataset. Description is saved as an HDF5
        attribute of the dataset. Defaults to None, for which no
        description is saved.
    overwrite: bool
        If True, saving overwrites the file. Otherwise, data is appended
        to the file. Defaults to True.
    compression_level: int or None, optional
        Integer from 0 to 9 specifying compression level for gzip filter,
        which is available on all h5py installations and offers good
        compression with moderate speed. Defaults to None, for which no
        compression/filter is applied.
    """
    file_mode = 'w' if overwrite else 'a'
    with _h5py.File(filepath, file_mode) as fid:
        if compression_level is not None:
            fid.create_dataset(
                name, data=data, compression='gzip',
                compression_opts=compression_level)
        else:
            fid.create_dataset(name, data=data)
        if description is not None:
            fid[name].attrs['Description'] = description


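# Usage sketch (illustrative; the file name and data are hypothetical):
# write a gzip-compressed dataset with a description attribute.
#
#     import numpy as np
#     import high5py as h5
#     h5.save_dataset('example.h5', np.arange(1000), name='data',
#                     description='Integers 0-999', compression_level=4)

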
def delete(filepath, name):
    """Delete group/dataset in HDF5 file.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str
        HDF5 name (e.g., /group/old_dataset).
    """
    with _h5py.File(filepath, 'a') as fid:
        del fid[name]


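# Usage sketch (illustrative, hypothetical names):
#
#     import high5py as h5
#     h5.delete('example.h5', '/group/old_dataset')

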
def rename(filepath, old_name, new_name, new_description=None):
    """Rename group/dataset in HDF5 file.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    old_name: str
        Old HDF5 name (e.g., /group/old_dataset).
    new_name: str
        New HDF5 name (e.g., /group/new_dataset).
    new_description: str, optional
        New string describing dataset. Description is saved as an HDF5
        attribute of the dataset. Defaults to None, for which the old
        description is kept.
    """
    with _h5py.File(filepath, 'a') as fid:
        # Link the data under the new name, then remove the old link
        fid[new_name] = fid[old_name]
        if new_description is not None:
            fid[new_name].attrs['Description'] = new_description
        del fid[old_name]


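# Usage sketch (illustrative, hypothetical names): rename a dataset and
# update its description in one call.
#
#     import high5py as h5
#     h5.rename('example.h5', '/group/old_dataset', '/group/new_dataset',
#               new_description='Same data under a new name')

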
def append_dataset(
        filepath, data, name='data', description=None,
        compression_level=None):
    """Append dataset to HDF5 file (never overwrites file).

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    data: array-like, scalar, or str
        Data to save.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset). Defaults to 'data'.
    description: str, optional
        String describing dataset. Description is saved as an HDF5
        attribute of the dataset. Defaults to None, for which no
        description is saved.
    compression_level: int or None, optional
        Integer from 0 to 9 specifying compression level for gzip filter,
        which is available on all h5py installations and offers good
        compression with moderate speed. Defaults to None, for which no
        compression/filter is applied.
    """
    save_dataset(
        filepath, data, name=name, description=description,
        overwrite=False, compression_level=compression_level)


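# Usage sketch (illustrative, hypothetical names): create a file with one
# dataset, then add a second without clobbering the first.
#
#     import numpy as np
#     import high5py as h5
#     h5.save_dataset('example.h5', np.zeros(3), name='a')   # creates file
#     h5.append_dataset('example.h5', np.ones(3), name='b')  # adds to it

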
def replace_dataset(
        filepath, data, name='data', description=None,
        compression_level=None):
    """Replace/overwrite a dataset in an HDF5 file (without overwriting
    the rest of the file).

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    data: array-like, scalar, or str
        Data to save.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset). Defaults to 'data'.
    description: str, optional
        String describing dataset. Description is saved as an HDF5
        attribute of the dataset. Defaults to None, for which no
        description is saved.
    compression_level: int or None, optional
        Integer from 0 to 9 specifying compression level for gzip filter,
        which is available on all h5py installations and offers good
        compression with moderate speed. Defaults to None, for which no
        compression/filter is applied.
    """
    delete(filepath, name)
    append_dataset(
        filepath, data, name=name, description=description,
        compression_level=compression_level)


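# Usage sketch (illustrative, hypothetical names). Note that the named
# dataset must already exist, since delete() is called first.
#
#     import numpy as np
#     import high5py as h5
#     h5.replace_dataset('example.h5', np.ones(5), name='a')

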
def load_attributes(filepath, name='data'):
    """Load HDF5 group/dataset attributes from HDF5 file.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset). Defaults to 'data'.

    Returns
    -------
    attributes: dict
        Dictionary of loaded attributes.
    """
    with _h5py.File(filepath, 'r') as fid:
        return dict(fid[name].attrs)


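# Usage sketch (illustrative, hypothetical names):
#
#     import high5py as h5
#     attrs = h5.load_attributes('example.h5', name='data')
#     attrs.get('Description')

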
def save_attributes(filepath, attributes, name='data', overwrite=True):
    """Save HDF5 group/dataset attributes (overwrites existing attributes
    by default).

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    attributes: dict
        Attributes to save.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset). Defaults to 'data'.
    overwrite: bool
        If True, saving overwrites existing attributes. Otherwise, new
        attributes are appended to existing ones. Defaults to True.
    """
    with _h5py.File(filepath, 'a') as fid:
        if overwrite:
            # Copy the keys first so the attribute set is not mutated
            # while being iterated over
            for key in list(fid[name].attrs.keys()):
                del fid[name].attrs[key]
        for key, val in attributes.items():
            fid[name].attrs[key] = val


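# Usage sketch (illustrative; the attribute names are hypothetical):
#
#     import high5py as h5
#     h5.save_attributes('example.h5', {'units': 'm/s', 'version': 2},
#                        name='data')

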
def append_attributes(filepath, attributes, name='data'):
    """Append HDF5 group/dataset attributes (never overwrites existing
    attributes).

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    attributes: dict
        Attributes to append.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset). Defaults to 'data'.
    """
    save_attributes(filepath, attributes, name=name, overwrite=False)


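# Usage sketch (illustrative, hypothetical names): add an attribute while
# keeping any existing ones.
#
#     import high5py as h5
#     h5.append_attributes('example.h5', {'source': 'sensor 3'}, name='data')

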
def to_npz(h5_filepath, npz_filepath, name='/'):
    """Save an HDF5 group/dataset to NPZ (compressed numpy archive) format.

    Subgroups such as path/group/subgroup/dataset will be saved with array
    names such as path_group_subgroup_dataset.

    Parameters
    ----------
    h5_filepath: str
        Path to HDF5 file.
    npz_filepath: str
        Path to NPZ file.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset). Defaults to root
        group ('/').
    """
    # Open file for processing
    with _h5py.File(h5_filepath, 'r') as fid:
        # Initialize root and the queue of paths to check
        dataset_names = []
        if isinstance(name, str):
            if isinstance(fid[name], _h5py.Dataset):
                root = '/'
                dataset_names = [fid[name].name]
                names_to_check = []
            else:
                root = name
                names_to_check = [name]
        else:
            # A list of names was given; copy it so the caller's list is
            # not mutated
            root = '/'
            names_to_check = list(name)

        # Process until there are no more paths to check. Pop from the
        # queue rather than removing items while iterating over the list,
        # which would skip entries.
        while names_to_check:
            subname = names_to_check.pop()
            if isinstance(fid[subname], _h5py.Dataset):
                dataset_names.append(fid[subname].name)
            elif isinstance(fid[subname], _h5py.Group):
                for subsubname in fid[subname]:
                    names_to_check.append(
                        fid['{}/{}'.format(subname, subsubname)].name)

        # Generate dataset names for the NPZ file starting from the
        # specified root, replacing slashes with underscores, since NPZ
        # files don't have groups
        kwargs = {}
        for dsn in dataset_names:
            # Split off first instance of root
            key = root.join(dsn.split(root)[1:])
            # Remove leading slash, if any
            if key.startswith('/'):
                key = key[1:]
            # Replace slashes with underscores
            key = key.replace('/', '_')
            # Add processed name
            kwargs[key] = fid[dsn][()]

    # Save data
    _np.savez_compressed(npz_filepath, **kwargs)


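# Usage sketch (illustrative, hypothetical names): export a whole file,
# then a single group; nested paths become underscore-joined array names.
#
#     import numpy as np
#     import high5py as h5
#     h5.to_npz('example.h5', 'example.npz')
#     h5.to_npz('example.h5', 'subset.npz', name='/group')
#     np.load('example.npz')['group_dataset']    # was /group/dataset

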
# Convert from NPZ (numpy archive) format
def from_npz(npz_filepath, h5_filepath):
    """Load data from an NPZ (compressed numpy archive) file and save to
    HDF5. NPZ array names are preserved.

    Parameters
    ----------
    npz_filepath: str
        Path to NPZ file.
    h5_filepath: str
        Path to HDF5 file.
    """
    # Open file for processing
    with _np.load(npz_filepath) as data:
        # Loop through arrays, overwriting the HDF5 file on the first
        # dataset and appending thereafter
        for idx, (key, val) in enumerate(data.items()):
            if idx == 0:
                save_dataset(h5_filepath, val, name=key)
            else:
                append_dataset(h5_filepath, val, name=key)


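# Usage sketch (illustrative, hypothetical names): round-trip arrays from
# NPZ into a fresh HDF5 file. Note that the first array overwrites any
# existing file at h5_filepath.
#
#     import numpy as np
#     import high5py as h5
#     np.savez_compressed('arrays.npz', x=np.arange(3), y=np.eye(2))
#     h5.from_npz('arrays.npz', 'arrays.h5')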