Re-added read_mtable_as_dataframe(filename) so that the Jupyter notebook can use it to demonstrate some functionality

2024-11-23 16:31:29 +01:00
parent fd92bce802
commit 6701bc06ad
1 changed files with 64 additions and 0 deletions
--- a/src/hdf5_ops.py
+++ b/src/hdf5_ops.py
@ -572,6 +572,70 @@ def get_groups_at_a_level(file: h5py.File, level: str):
    #file.visititems()
    return groups
 def read_mtable_as_dataframe(filename: str) -> "pd.DataFrame":
    """
    Reconstruct a MATLAB Table encoded in a .h5 file as a Pandas DataFrame.

    The input .h5 file is expected to contain one top-level group per row of
    the MATLAB Table. Each group stores the table's dataset-like variables as
    Datasets, while categorical and numerical variables are represented as
    attributes of the respective group.

    The layout (attribute names and dataset names) is taken from the first
    group and assumed to hold for every other group. To ensure homogeneity of
    data columns, the DataFrame is constructed column-wise, so that pandas
    infers one dtype per column.

    Parameters
    ----------
    filename : str
        The name of the .h5 file. This may include the file's location and path information.

    Returns
    -------
    pd.DataFrame
        The MATLAB Table reconstructed as a Pandas DataFrame, indexed by the
        top-level group names. Attribute columns come first (labelled with the
        text following the first underscore of the attribute name), followed by
        dataset columns (labelled with each dataset's 'column_name' attribute).
        An empty DataFrame is returned when the file has no top-level groups.

    Raises
    ------
    KeyError
        If a group lacks an attribute or dataset that the first group has, or
        if a dataset in the first group has no 'column_name' attribute.
    """
    with h5py.File(filename, 'r') as file:
        group_list = list(file.keys())
        if not group_list:
            # Nothing to reconstruct; avoid indexing into an empty group list.
            return pd.DataFrame()

        # Define group's attributes and datasets from the first group. This should hold
        # for all groups. TODO: implement verification and noncompliance error if needed.
        first_group = file[group_list[0]]
        group_attrs = list(first_group.attrs.keys())
        # A member named 'DS_EMPTY' marks a row without dataset-like variables.
        if 'DS_EMPTY' in first_group.keys():
            group_datasets = []
        else:
            group_datasets = list(first_group.keys())

        # Collect one list per column. Attributes and datasets are handled in
        # separate loops so that a dataset sharing its name with an attribute
        # is never mistaken for one.
        columns = {}

        for attr_name in group_attrs:
            # Column label is whatever follows the first underscore of the attribute name.
            column_label = attr_name[attr_name.find('_') + 1:]
            # Numerical or categorical column: first element of each group's attribute value.
            columns[column_label] = [
                file[group_key].attrs[attr_name][()][0] for group_key in group_list
            ]

        for dataset_name in group_datasets:
            column_label = first_group[dataset_name].attrs['column_name']
            # Dataset column: the full content of each group's dataset.
            columns[column_label] = [
                file[group_key + '/' + dataset_name][()] for group_key in group_list
            ]

        # Building the frame from whole columns lets pandas infer a dtype per
        # column; assigning into a pre-allocated object frame with .loc keeps
        # object dtype on recent pandas versions.
        output_dataframe = pd.DataFrame(columns, index=group_list)

    return output_dataframe
 if __name__ == "__main__":
    if len(sys.argv) < 4: