Source code for high5py.high5py

import numpy as _np
import h5py as _h5py


def info(filepath, name='/', return_info=False):
    """Print and optionally return information about an HDF5 object.

    The file is opened read-only.  For a group, the names of its immediate
    sub-groups and datasets are collected; for a dataset, its datatype, shape,
    size, chunking and compression settings are collected.  Attributes are
    collected in both cases.  The result is printed as right-aligned
    ``key: value`` lines.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset).  Defaults to root group
        ('/').
    return_info: bool, optional
        If True, return a dictionary of results.  Defaults to False.

    Returns
    -------
    info: dict, optional
        Dictionary of key, value pairs describing specified file/group/dataset.
        Only provided if return_info is True; otherwise nothing is returned.
    """
    name = '{}'.format(name)
    with _h5py.File(filepath, 'r') as fid:
        # Resolve the requested object once rather than on every access
        node = fid[name]
        info_dict = {'filename': fid.filename, 'name': node.name}
        if isinstance(node, _h5py.Group):
            groups = []
            datasets = []
            for subname in node:
                # Look each child up a single time and classify it
                child = fid['{}/{}'.format(name, subname)]
                if isinstance(child, _h5py.Group):
                    groups.append(subname)
                if isinstance(child, _h5py.Dataset):
                    datasets.append(subname)
            info_dict['groups'] = groups
            info_dict['datasets'] = datasets
        if isinstance(node, _h5py.Dataset):
            info_dict['datatype'] = node.dtype
            info_dict['shape'] = node.shape
            info_dict['size'] = node.size
            info_dict['chunks'] = node.chunks
            info_dict['compression'] = node.compression
        info_dict['attributes'] = {
            key: val for key, val in node.attrs.items()}

    # The key column width is the same for every line, so compute it once
    width = max(len(key) for key in info_dict)
    for key, val in info_dict.items():
        print('{:>{}}: {}'.format(key, width, val))
    if return_info:
        return info_dict


def list_all(filepath, name='/', return_info=False):
    """List all groups and datasets in HDF5 file or group.

    The file is opened read-only and every object below ``name`` is visited
    once.  Each object is printed as its name (relative to ``name``), padded to
    a common width, followed by its string representation.  Nothing is printed
    when the group has no members.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 group name (e.g., /group).  Defaults to root group ('/').
    return_info: bool, optional
        If True, return a dictionary of results.  Defaults to False.

    Returns
    -------
    info: dict, optional
        Dictionary mapping each visited name to the string representation of
        the corresponding object.  Only provided if return_info is True.
    """
    visit_order = []
    all_items = {}

    def record_item(item_name, obj):
        # Convert to a string now, while the file is still open.  Nothing is
        # returned, so the traversal continues through all members.
        all_items[item_name] = str(obj)
        visit_order.append(item_name)

    with _h5py.File(filepath, 'r') as fid:
        fid[name].visititems(record_item)

    # Guard against an empty group, for which there is no width to compute
    if visit_order:
        max_len = max(len(item_name) for item_name in visit_order)
        for item_name in visit_order:
            print('{:<{}}    {}'.format(
                item_name, max_len, all_items[item_name]))
    if return_info:
        return all_items


def exists(filepath, name):
    """Determine if group/dataset name exists in HDF5 file.

    The file is opened read-only and membership is tested directly on the file
    object (``name in fid``) rather than by walking every object in the file.
    This lets names written with a leading slash (e.g., /group/dataset) be
    matched as well as names written relative to the root group.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str
        HDF5 group/dataset name (e.g., /group/dataset).

    Returns
    -------
    exists: bool
        Boolean describing if path exists in HDF5 file.
    """
    with _h5py.File(filepath, 'r') as fid:
        return name in fid


def load_dataset(filepath, name='data', start_index=None, end_index=None):
    """Read a dataset, or a slice of it, from an HDF5 file.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset).  Defaults to 'data'.
    start_index: int, optional
        Start index of the slice to read.  Defaults to None, for which the
        slice has no lower bound.
    end_index: int, optional
        End index of the slice to read.  Defaults to None, for which the slice
        has no upper bound.

    Returns
    -------
    data: array-like, scalar, or str
        Values read from the dataset.  When both indices are None the dataset
        is read with an empty-tuple index (``[()]``); otherwise it is read with
        the slice ``[start_index:end_index]``.
    """
    read_everything = start_index is None and end_index is None
    with _h5py.File(filepath, 'r') as fid:
        dataset = fid[name]
        if read_everything:
            # Empty-tuple indexing reads the complete dataset
            return dataset[()]
        # A None bound leaves that side of the slice open
        return dataset[start_index:end_index]


def save_dataset(
    filepath, data, name='data', description=None, overwrite=True,
    compression_level=None):
    """Write a dataset to an HDF5 file (the file is overwritten by default).

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    data: array-like, scalar, or str
        Data to save.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset).  Defaults to 'data'.
    description: str, optional
        String describing dataset, stored in the dataset's 'Description'
        attribute.  Defaults to None, for which no description is saved.
    overwrite: bool
        If True, the file is opened in 'w' mode; otherwise it is opened in 'a'
        mode so the dataset is added to the existing file.  Defaults to True.
    compression_level: int or None, optional
        Value passed as ``compression_opts`` together with
        ``compression='gzip'`` when creating the dataset.  Defaults to None,
        for which no compression arguments are passed.
    """
    file_mode = 'w' if overwrite else 'a'

    # Only pass compression arguments when a level was requested
    create_kwargs = {'data': data}
    if compression_level is not None:
        create_kwargs['compression'] = 'gzip'
        create_kwargs['compression_opts'] = compression_level

    with _h5py.File(filepath, file_mode) as fid:
        fid.create_dataset(name, **create_kwargs)
        if description is not None:
            fid[name].attrs['Description'] = description


def delete(filepath, name):
    """Remove a group/dataset from an HDF5 file.

    The file is opened in 'a' mode and the entry ``name`` is deleted from it.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str
        HDF5 name to remove (e.g., /group/old_dataset).
    """
    with _h5py.File(filepath, 'a') as h5_file:
        del h5_file[name]


def rename(filepath, old_name, new_name, new_description=None):
    """Give a group/dataset in an HDF5 file a new name.

    The object is first made available under ``new_name`` and the entry at
    ``old_name`` is removed afterwards.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    old_name: str
        Old HDF5 name (e.g., /group/old_dataset).
    new_name: str
        New HDF5 name (e.g., /group/new_dataset).
    new_description: str, optional
        New string describing the object, stored in its 'Description'
        attribute.  Defaults to None, for which the attribute is left
        untouched.
    """
    with _h5py.File(filepath, 'a') as h5_file:
        # Assigning the existing object to a new name links it there
        h5_file[new_name] = h5_file[old_name]
        renamed = h5_file[new_name]
        if new_description is not None:
            renamed.attrs['Description'] = new_description
        # Drop the old name only after the new one is in place
        del h5_file[old_name]


def append_dataset(
    filepath, data, name='data', description=None, compression_level=None):
    """Add a dataset to an HDF5 file without overwriting the file.

    Thin wrapper around save_dataset with ``overwrite=False``.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    data: array-like, scalar, or str
        Data to save.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset).  Defaults to 'data'.
    description: str, optional
        String describing dataset, forwarded to save_dataset.  Defaults to
        None, for which no description is saved.
    compression_level: int or None, optional
        Compression level forwarded to save_dataset.  Defaults to None, for
        which no compression is requested.
    """
    forwarded = {
        'name': name,
        'description': description,
        'overwrite': False,
        'compression_level': compression_level,
    }
    save_dataset(filepath, data, **forwarded)


def replace_dataset(
    filepath, data, name='data', description=None, compression_level=None):
    """Replace one dataset in an HDF5 file, leaving the rest of the file
    alone.

    The existing entry ``name`` is deleted, then the new data is saved under
    the same name with the file opened in append mode.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    data: array-like, scalar, or str
        Data to save.
    name: str, optional
        HDF5 dataset name (e.g., /group/dataset).  Defaults to 'data'.
    description: str, optional
        String describing dataset, forwarded to save_dataset.  Defaults to
        None, for which no description is saved.
    compression_level: int or None, optional
        Compression level forwarded to save_dataset.  Defaults to None, for
        which no compression is requested.
    """
    # Step 1: remove the old entry
    delete(filepath, name)

    # Step 2: write the new data without truncating the file
    save_dataset(
        filepath, data, name=name, description=description, overwrite=False,
        compression_level=compression_level)


def load_attributes(filepath, name='data'):
    """Read the attributes of an HDF5 group/dataset into a dictionary.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset).  Defaults to 'data'.

    Returns
    -------
    attributes: dict
        Dictionary mapping each attribute name to its loaded value.
    """
    with _h5py.File(filepath, 'r') as h5_file:
        attrs = h5_file[name].attrs
        # Copy into a plain dict while the file is still open
        return {key: attrs[key] for key in attrs}


def save_attributes(filepath, attributes, name='data', overwrite=True):
    """Save HDF5 group/dataset attributes (removes existing attributes by
    default).

    The file is opened in 'a' mode.  With ``overwrite=True`` every attribute
    currently on the object is deleted before the new ones are assigned.  With
    ``overwrite=False`` existing attributes are kept, and each key in
    ``attributes`` is simply assigned on top of them.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    attributes: dict
        Attributes to save.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset).  Defaults to 'data'.
    overwrite: bool
        If True, all existing attributes are removed first.  Otherwise, new
        attributes are added to the existing ones.  Defaults to True.
    """
    with _h5py.File(filepath, 'a') as fid:
        attrs = fid[name].attrs
        if overwrite:
            # Snapshot the keys first so nothing is deleted from the
            # collection while it is being iterated; values are not needed.
            for key in list(attrs):
                del attrs[key]
        for key, val in attributes.items():
            attrs[key] = val


def append_attributes(filepath, attributes, name='data'):
    """Add attributes to an HDF5 group/dataset without first clearing the
    attributes already present.

    Thin wrapper around save_attributes with ``overwrite=False``.

    Parameters
    ----------
    filepath: str
        Path to HDF5 file.
    attributes: dict
        Attributes to append.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset).  Defaults to 'data'.
    """
    keep_existing = dict(name=name, overwrite=False)
    save_attributes(filepath, attributes, **keep_existing)


def to_npz(h5_filepath, npz_filepath, name='/'):
    """Save an HDF5 group/dataset to NPZ (compressed numpy archive) format.
    Subgroups such as path/group/subgroup/dataset will be saved with array names
    such as path_group_subgroup_dataset.

    Groups are walked level by level and every dataset found is read in full.
    Objects that are neither groups nor datasets are skipped.

    Parameters
    ----------
    h5_filepath: str
        Path to HDF5 file.
    npz_filepath: str
        Path to NPZ file.
    name: str, optional
        HDF5 group/dataset name (e.g., /group/dataset).  Defaults to root group
        ('/').  A non-string value is treated as a collection of names to
        process, with array names generated relative to the root group; the
        collection itself is not modified.
    """
    # Open file for processing
    with _h5py.File(h5_filepath, 'r') as fid:

        # Initialize root, paths to check
        dataset_names = []
        if isinstance(name, str):
            if isinstance(fid[name], _h5py.Dataset):
                root = '/'
                dataset_names.append(fid[name].name)
                names_to_check = []
            else:
                root = name
                names_to_check = [name]
        else:
            root = '/'
            # Work on a copy so the caller's collection is left untouched
            names_to_check = list(name)

        # Process one level at a time, building the next level in a separate
        # list so the list being iterated is never modified
        while names_to_check:
            next_names = []
            for subname in names_to_check:
                obj = fid[subname]
                if isinstance(obj, _h5py.Dataset):
                    dataset_names.append(obj.name)
                elif isinstance(obj, _h5py.Group):
                    for subsubname in obj:
                        next_names.append(
                            fid['{}/{}'.format(subname, subsubname)].name)
                # Any other kind of object is dropped here; keeping it queued
                # would prevent the loop from ever finishing
            names_to_check = next_names

        # Generate dataset names for NPZ file starting from the specified root,
        # and replacing slashes with underscores, since NPZ files don't have
        # groups
        kwargs = {}
        for dsn in dataset_names:

            # Keep what follows the first instance of root; if root does not
            # occur in the dataset name, fall back to the full name
            _, found, remainder = dsn.partition(root)
            key = remainder if found else dsn

            # Remove leading slash, if any (safe for an empty key)
            if key.startswith('/'):
                key = key[1:]

            # Replace slashes with underscores
            key = key.replace('/', '_')

            # Add processed name
            kwargs[key] = fid[dsn][()]

    # Save data
    _np.savez_compressed(npz_filepath, **kwargs)


# Convert from NPZ (numpy archive) format
def from_npz(npz_filepath, h5_filepath):
    """Copy every array in an NPZ (numpy archive) file into an HDF5 file.
    NPZ array names are used unchanged as the HDF5 dataset names.

    The first array is saved with ``overwrite=True``, so any existing HDF5
    file at the target path is replaced; the remaining arrays are added to
    that file.

    Parameters
    ----------
    npz_filepath: str
        Path to NPZ file.
    h5_filepath: str
        Path to HDF5 file.
    """
    with _np.load(npz_filepath) as archive:
        # Only the very first write is allowed to truncate the HDF5 file
        is_first = True
        for key, val in archive.items():
            save_dataset(h5_filepath, val, name=key, overwrite=is_first)
            is_first = False