def read_mtable_as_dataframe(filename: str) -> pd.DataFrame:
    """
    Reconstruct a MATLAB Table encoded in a .h5 file as a Pandas DataFrame.

    The input .h5 file is expected to contain one top-level group per row of
    the MATLAB Table. Within each group, the table's dataset-like variables
    are stored as Datasets (each carrying a 'column_name' attribute), while
    categorical and numerical variables are stored as attributes of the group.
    A group holding a member named 'DS_EMPTY' is treated as having no
    dataset-like variables.

    The DataFrame is filled column by column, so that every column is built
    from the same attribute or dataset across all groups.

    Parameters
    ----------
    filename : str
        The name of the .h5 file. This may include the file's location and
        path information.

    Returns
    -------
    pd.DataFrame
        The MATLAB Table reconstructed as a Pandas DataFrame, indexed by the
        names of the top-level groups. Attribute-based columns come first and
        are labelled with the part of the attribute name that follows its
        first underscore; dataset-based columns follow and are labelled with
        the dataset's 'column_name' attribute. An empty DataFrame is returned
        when the file contains no groups.

    Notes
    -----
    The column layout is taken from the first group only and is assumed to
    hold for every other group.
    TODO: implement verification and a noncompliance error if needed.
    """
    with h5py.File(filename, 'r') as file:
        group_list = list(file.keys())

        # A file without groups encodes a table without rows: there is no
        # first group to read the column layout from.
        if not group_list:
            return pd.DataFrame()

        # The first group defines the attributes and datasets expected in
        # every group.
        first_group = file[group_list[0]]
        group_attrs = list(first_group.attrs.keys())
        group_datasets = [] if 'DS_EMPTY' in first_group else list(first_group.keys())

        # Column labels: attribute name without its leading '<prefix>_' part,
        # and the 'column_name' attribute attached to each dataset.
        column_attr_names = [name[name.find('_') + 1:] for name in group_attrs]
        column_dataset_names = [
            first_group[name].attrs['column_name'] for name in group_datasets
        ]

        output_dataframe = pd.DataFrame(
            columns=column_attr_names + column_dataset_names,
            index=group_list,
        )

        # Numerical or categorical columns: attributes are stored as arrays,
        # of which the first element is the table entry.
        for attr_name, column_label in zip(group_attrs, column_attr_names):
            output_dataframe.loc[:, column_label] = [
                file[group_key].attrs[attr_name][()][0] for group_key in group_list
            ]

        # Dataset columns: each entry is the full content of the dataset.
        for dataset_name, column_label in zip(group_datasets, column_dataset_names):
            output_dataframe.loc[:, column_label] = [
                file[group_key + '/' + dataset_name][()] for group_key in group_list
            ]

    return output_dataframe