Added a new function, read_hdf5_as_dataframe_v2, which supports HDF5 files without datasets and builds the DataFrame column by column instead of row by row.
This commit is contained in:
59
hdf5_lib.py
59
hdf5_lib.py
@@ -2,8 +2,18 @@ import pandas as pd
|
|||||||
import h5py
|
import h5py
|
||||||
#import os
|
#import os
|
||||||
import sys
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def is_wrapped(value) -> bool:
    """Return True if ``value`` is a NumPy array of shape (1, 1), else False.

    Anything that is not an ``np.ndarray`` is reported as not wrapped.

    The shape is compared exactly: summing the shape entries and checking
    for 2 would also accept shapes such as (2,), (2, 0) or (0, 2), none of
    which is a 1-by-1 array.
    """
    return isinstance(value, np.ndarray) and value.shape == (1, 1)
|
||||||
|
|
||||||
|
|
||||||
filename = 'FileList.h5'
|
|
||||||
|
|
||||||
def read_hdf5_as_dataframe(filename):
|
def read_hdf5_as_dataframe(filename):
|
||||||
|
|
||||||
@@ -35,13 +45,17 @@ def read_hdf5_as_dataframe(filename):
|
|||||||
tmp_row = []
|
tmp_row = []
|
||||||
for attr_key in group_attrs:
|
for attr_key in group_attrs:
|
||||||
#print(type(file[group_key].attrs[attr_key]))
|
#print(type(file[group_key].attrs[attr_key]))
|
||||||
tmp_row.append(file[group_key].attrs[attr_key])
|
df_entry = file[group_key].attrs[attr_key][()]
|
||||||
|
tmp_row.append(df_entry)
|
||||||
|
|
||||||
for ds_key in group_datasets:
|
for ds_key in group_datasets:
|
||||||
# Check dataset's type by uncommenting the line below
|
# Check dataset's type by uncommenting the line below
|
||||||
# print(type(file[group_key][ds_key][()]))
|
# print(type(file[group_key][ds_key][()]))
|
||||||
|
|
||||||
# Append to list the value of the file at dataset /group/ds
|
# Append to list the value of the file at dataset /group/ds
|
||||||
tmp_row.append(file[group_key][ds_key][()])
|
#tmp_row.append(file[group_key][ds_key][()])
|
||||||
|
#tmp_row.append(file[group_key+'/'+ds_key][()])
|
||||||
|
tmp_row.append(file[group_key+'/'+ds_key][()])
|
||||||
|
|
||||||
# Create pandas Series/measurement
|
# Create pandas Series/measurement
|
||||||
row = pd.Series(data=tmp_row,index=pd_series_index, name = group_key)
|
row = pd.Series(data=tmp_row,index=pd_series_index, name = group_key)
|
||||||
@@ -49,4 +63,43 @@ def read_hdf5_as_dataframe(filename):
|
|||||||
|
|
||||||
return output_dataframe
|
return output_dataframe
|
||||||
|
|
||||||
|
def read_hdf5_as_dataframe_v2(filename):
    """Read an HDF5 file into a DataFrame, filling it column by column.

    Each top-level key of the file becomes one row. The column layout is
    taken from the first top-level group and applied to every group; no
    check is made that the other groups actually share that layout.

    Columns come from two sources, in this order:

    * Group attributes. The column label is the attribute name with
      everything up to and including the first underscore removed (the
      whole name is used when it contains no underscore).
    * Datasets inside the group. The column label is read from the
      dataset's ``column_name`` attribute. If the first group contains a
      key named ``DS_EMPTY``, the file is treated as having no datasets.

    For every attribute and dataset only the first element of the stored
    value (``[()][0]``) is placed in the table.

    Each column is assigned as a whole Series so that pandas infers one
    dtype per column instead of keeping the object dtype of the
    pre-allocated, empty frame.

    Parameters
    ----------
    filename
        Path (or other value accepted by ``h5py.File``) of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per top-level group, indexed by group name. An empty
        DataFrame is returned when the file has no top-level groups.
    """
    with h5py.File(filename, 'r') as file:
        group_list = list(file.keys())
        if not group_list:
            # No group to take the column layout from.
            return pd.DataFrame()

        # The first group defines the layout assumed for all groups.
        # TODO: implement verification and noncompliance error if needed.
        first_group = file[group_list[0]]
        group_attrs = list(first_group.attrs.keys())
        first_group_keys = list(first_group.keys())
        group_datasets = [] if 'DS_EMPTY' in first_group_keys else first_group_keys

        column_attr_names = [name[name.find('_') + 1:] for name in group_attrs]
        column_dataset_names = [
            first_group[name].attrs['column_name'] for name in group_datasets
        ]

        # Pre-allocate so the column order is attributes first, then datasets.
        output_dataframe = pd.DataFrame(
            columns=column_attr_names + column_dataset_names,
            index=group_list,
        )

        # Attributes and datasets are handled in separate loops so that a
        # dataset sharing its key with an attribute is still read as a dataset.
        for attr_name, column_label in zip(group_attrs, column_attr_names):
            column = [
                file[group_key].attrs[attr_name][()][0] for group_key in group_list
            ]
            output_dataframe[column_label] = pd.Series(column, index=group_list)

        for ds_name, column_label in zip(group_datasets, column_dataset_names):
            column = [
                file[group_key + '/' + ds_name][()][0] for group_key in group_list
            ]
            output_dataframe[column_label] = pd.Series(column, index=group_list)

    return output_dataframe
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user