# Reconstructed from git patch 0248ca3c6e6b4768b47dbe3d0840e7350a09146a against
# hdf5_lib.py (Florez Ospina Juan Felipe, Tue, 10 Oct 2023 09:00:41 +0200):
# "Added a new function read_hdf5_as_dataframe_v2 which supports hdf5 files
# without datasets and replaced the dataframe construction procedure to a
# column wise basis."
#
# NOTE(review): the same patch also removes the module-level
# `filename = 'FileList.h5'` and changes two statements inside
# read_hdf5_as_dataframe. Only hunk fragments of that function are visible, so
# it is not reproduced here; the visible changes are:
#   - attribute entries are read as file[group_key].attrs[attr_key][()]
#   - dataset entries are read as file[group_key+'/'+ds_key][()]
import numpy as np
import pandas as pd


def is_wrapped(value):
    """Return True if value is a 1 by 1 numpy array, or False otherwise.

    Args:
        value: Any object.

    Returns:
        bool: True only when ``value`` is a ``numpy.ndarray`` whose shape is
        exactly ``(1, 1)``.
    """
    # Compare the full shape tuple: summing the dimensions would also accept
    # shapes such as (2,), (2, 0) or (0, 2), none of which is a 1 by 1 array.
    return isinstance(value, np.ndarray) and value.shape == (1, 1)


def read_hdf5_as_dataframe_v2(filename):
    """Construct a dataframe from an HDF5 file by filling out entries columnwise.

    Each top-level group of the file becomes one row, indexed by the group
    name. The first group is used as the template that defines the columns
    for all groups:

    * one column per attribute of the group, labelled with the attribute key
      stripped of everything up to and including its first underscore;
    * one column per dataset of the group, labelled with the dataset's
      ``'column_name'`` attribute. If the group contains a member called
      ``'DS_EMPTY'`` it is treated as having no datasets.

    For every attribute and dataset only the first element (``[0]``) of the
    stored value is placed in the dataframe.

    Args:
        filename: Path of the HDF5 file, opened read-only.

    Returns:
        pandas.DataFrame: One row per group and one column per attribute or
        dataset. An empty dataframe is returned when the file has no groups.

    NOTE(review): all groups are assumed to share the attributes and datasets
    of the first group; a group missing one of them is expected to fail on
    lookup. TODO: implement verification and noncompliance error if needed.
    NOTE(review): values are assumed to be at least one-dimensional so that
    ``[0]`` is valid -- confirm against the files this is used with.
    """
    # Imported here so that is_wrapped remains importable when h5py is absent.
    import h5py

    with h5py.File(filename, 'r') as file:
        group_list = list(file.keys())
        if not group_list:
            # Nothing to use as a template; avoid IndexError on group_list[0].
            return pd.DataFrame()

        template_group = file[group_list[0]]
        group_attrs = list(template_group.attrs.keys())
        member_keys = list(template_group.keys())
        group_datasets = [] if 'DS_EMPTY' in member_keys else member_keys

        column_attr_names = [key[key.find('_') + 1:] for key in group_attrs]
        column_dataset_names = [
            template_group[key].attrs['column_name'] for key in group_datasets
        ]

        output_dataframe = pd.DataFrame(
            columns=column_attr_names + column_dataset_names,
            index=group_list,
        )

        # Attributes and datasets are walked separately so that a dataset
        # whose key happens to equal an attribute key is still read as a
        # dataset.
        for attr_key, column_label in zip(group_attrs, column_attr_names):
            output_dataframe.loc[:, column_label] = [
                file[group_key].attrs[attr_key][()][0]
                for group_key in group_list
            ]

        for ds_key, column_label in zip(group_datasets, column_dataset_names):
            output_dataframe.loc[:, column_label] = [
                file[group_key + '/' + ds_key][()][0]
                for group_key in group_list
            ]

    return output_dataframe