From 0248ca3c6e6b4768b47dbe3d0840e7350a09146a Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe <juan.florez-ospina@psi.ch>
Date: Tue, 10 Oct 2023 09:00:41 +0200
Subject: [PATCH] Added a new function read_hdf5_as_dataframe_v2 which supports
 HDF5 files without datasets and switched the dataframe construction procedure
 to a column-wise basis.

---
 hdf5_lib.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/hdf5_lib.py b/hdf5_lib.py
index 8526fcd..e9bb442 100644
--- a/hdf5_lib.py
+++ b/hdf5_lib.py
@@ -2,8 +2,18 @@ import pandas as pd
 import h5py
 #import os
 import sys
+import numpy as np
+
+def is_wrapped(value):
+    """Return True if value is a 1-by-1 numpy array, or False otherwise."""
+    # Compare the shape tuple directly: summing the dimensions would also
+    # accept shapes such as (2,) or (2, 0), which are not 1-by-1 arrays.
+    if not isinstance(value, np.ndarray):
+        return False
+    # Only a 2-D array holding exactly one element counts as wrapped.
+    return value.shape == (1, 1)
+
 
-filename = 'FileList.h5'
 
 def read_hdf5_as_dataframe(filename):
 
@@ -35,13 +45,17 @@ def read_hdf5_as_dataframe(filename):
             tmp_row = []
             for attr_key in group_attrs:
                 #print(type(file[group_key].attrs[attr_key]))
-                tmp_row.append(file[group_key].attrs[attr_key])
+                df_entry = file[group_key].attrs[attr_key][()]
+                tmp_row.append(df_entry)
+
             for ds_key in group_datasets:
                 # Check dataset's type by uncommenting the line below
                 # print(type(file[group_key][ds_key][()]))
 
                 # Append to list the value of the file at dataset /group/ds
-                tmp_row.append(file[group_key][ds_key][()])
+                #tmp_row.append(file[group_key][ds_key][()])
+                #tmp_row.append(file[group_key+'/'+ds_key][()])
+                tmp_row.append(file[group_key+'/'+ds_key][()])
 
             # Create pandas Series/measurement
             row = pd.Series(data=tmp_row,index=pd_series_index, name = group_key)
@@ -49,4 +63,43 @@ def read_hdf5_as_dataframe(filename):
 
     return output_dataframe
 
+def read_hdf5_as_dataframe_v2(filename):
+    """Read an HDF5 file into a DataFrame, filling it column by column.
+
+    Each top-level group becomes one row (indexed by group name). The first
+    group defines the columns: one per group attribute, labelled with the
+    text after the first '_' of the attribute name, followed by one per
+    dataset, labelled with the dataset's 'column_name' attribute. A first
+    group containing 'DS_EMPTY' contributes no dataset columns. Only element
+    [0] of every attribute/dataset value is stored. Returns an empty
+    DataFrame when the file has no groups.
+    """
+    with h5py.File(filename, 'r') as file:
+        group_list = list(file.keys())
+        if not group_list:
+            # Nothing to use as a template; avoid IndexError on group_list[0].
+            return pd.DataFrame()
+
+        # The first group is the template for all groups.
+        # TODO: implement verification and noncompliance error if needed.
+        first_group = file[group_list[0]]
+        group_attrs = list(first_group.attrs.keys())
+        group_datasets = [] if 'DS_EMPTY' in first_group.keys() else list(first_group.keys())
+
+        column_attr_names = [name[name.find('_') + 1:] for name in group_attrs]
+        column_dataset_names = [first_group[name].attrs['column_name'] for name in group_datasets]
+
+        output_dataframe = pd.DataFrame(columns=column_attr_names + column_dataset_names,
+                                        index=group_list)
+
+        # Attributes and datasets are handled in separate loops so that a dataset
+        # sharing its name with an attribute is never read as an attribute.
+        for attr_key, column_label in zip(group_attrs, column_attr_names):
+            output_dataframe.loc[:, column_label] = [
+                file[group_key].attrs[attr_key][()][0] for group_key in group_list]
+        for ds_key, column_label in zip(group_datasets, column_dataset_names):
+            output_dataframe.loc[:, column_label] = [
+                file[group_key + '/' + ds_key][()][0] for group_key in group_list]
+
+    return output_dataframe