187 lines
6.8 KiB
Python
187 lines
6.8 KiB
Python
import pandas as pd
|
|
import h5py
|
|
import os
|
|
import sys
|
|
import numpy as np
|
|
|
|
def is_wrapped(value):
    """Return True if *value* is a scalar wrapped in a 1-by-1 numpy array.

    Parameters
    ----------
    value : object
        Candidate value; anything that is not a ``numpy.ndarray`` returns False.

    Returns
    -------
    bool
        True only for arrays of shape ``(1, 1)``.

    Note
    ----
    BUG FIX: the original test was ``sum(value.shape) == 2``, which also
    matched any 1-D array of length 2 (shape ``(2,)``); the docstring's
    "1 by 1" contract requires the shape to be exactly ``(1, 1)``.
    """
    return isinstance(value, np.ndarray) and value.shape == (1, 1)
|
|
|
|
|
|
|
|
def read_hdf5_as_dataframe(filename):
    """Read an HDF5 file into a pandas DataFrame, one row per top-level group.

    Column labels are built from the first group's attribute keys (the text
    after the first underscore) plus each dataset's ``'column_name'``
    attribute.  The attribute/dataset layout of the first group is assumed
    to hold for every group.
    TODO: implement verification and noncompliance error if needed.

    Parameters
    ----------
    filename : str
        Path to the HDF5 file.

    Returns
    -------
    pandas.DataFrame
        Indexed by group name; one column per group attribute and dataset.
    """
    with h5py.File(filename, 'r') as file:
        group_list = list(file.keys())
        group_attrs = list(file[group_list[0]].attrs.keys())

        # Attribute keys look like '<prefix><idx>_<name>'; the column label
        # is the text after the first underscore.
        column_attr_names = [item[item.find('_') + 1:] for item in group_attrs]

        group_datasets = list(file[group_list[0]].keys())
        # Dataset column labels come from each dataset's 'column_name' attribute.
        column_dataset_names = [file[group_list[0]][item].attrs['column_name']
                                for item in group_datasets]

        # Columns: attribute-derived labels first, then dataset-derived labels.
        pd_series_index = column_attr_names + column_dataset_names

        output_dataframe = pd.DataFrame(columns=pd_series_index, index=group_list)

        for group_key in group_list:
            tmp_row = []

            # Attribute values, in the same order as column_attr_names.
            # NOTE(review): '[()]' assumes each attribute value supports
            # ndarray-style scalar indexing — confirm for string attributes.
            for attr_key in group_attrs:
                tmp_row.append(file[group_key].attrs[attr_key][()])

            # Dataset values, in the same order as column_dataset_names.
            for ds_key in group_datasets:
                tmp_row.append(file[group_key + '/' + ds_key][()])

            # One pandas Series per group/measurement.
            row = pd.Series(data=tmp_row, index=pd_series_index, name=group_key)
            output_dataframe.loc[group_key, :] = row

        return output_dataframe
|
|
|
|
def read_hdf5_as_dataframe_v2(filename):
    """Read an HDF5 file into a DataFrame, filling entries column by column.

    Constructing the frame columnwise (instead of row by row) keeps each
    DataFrame column homogeneously typed.  The attribute/dataset layout of
    the first group is assumed to hold for every group.
    TODO: implement verification and noncompliance error if needed.

    Parameters
    ----------
    filename : str
        Path to the HDF5 file.

    Returns
    -------
    pandas.DataFrame
        Indexed by group name; one column per group attribute and dataset.
    """
    with h5py.File(filename, 'r') as file:
        group_list = list(file.keys())
        group_attrs = list(file[group_list[0]].attrs.keys())

        # Attribute keys look like '<prefix><idx>_<name>'; the column label
        # is the text after the first underscore.
        column_attr_names = [item[item.find('_') + 1:] for item in group_attrs]

        # A 'DS_EMPTY' sentinel in the first group marks "no datasets".
        first_group_keys = file[group_list[0]].keys()
        group_datasets = list(first_group_keys) if 'DS_EMPTY' not in first_group_keys else []
        column_dataset_names = [file[group_list[0]][item].attrs['column_name']
                                for item in group_datasets]

        # Columns: attribute-derived labels first, then dataset-derived labels.
        pd_series_index = column_attr_names + column_dataset_names

        output_dataframe = pd.DataFrame(columns=pd_series_index, index=group_list)

        for meas_prop in group_attrs + group_datasets:
            if meas_prop in group_attrs:
                column_label = meas_prop[meas_prop.find('_') + 1:]
                # NOTE(review): '[()][0]' assumes each attribute value is a
                # one-element array-like — confirm against the file format.
                tmp_col = [file[group_key].attrs[meas_prop][()][0]
                           for group_key in group_list]
            else:
                column_label = file[group_list[0] + '/' + meas_prop].attrs['column_name']
                tmp_col = [file[group_key + '/' + meas_prop][()][0]
                           for group_key in group_list]

            output_dataframe.loc[:, column_label] = tmp_col

        return output_dataframe
|
|
|
|
def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None):
    """Build a DataFrame of files annotated with grouping levels.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        Either an existing file-system directory (its listing becomes the
        'filename' column) or a DataFrame to copy and annotate.
    group_by_funcs : list of callables
        Each returns a pd.Series (or list), whose unique values denote
        groups of input_data elements; result i is stored in the column
        'level_<i>_groups'.
    extract_attrs_func : callable, optional
        Currently unused; reserved for attribute extraction.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with one 'level_<i>_groups' column per function.

    Raises
    ------
    ValueError
        If group_by_funcs contains a non-callable, or input_data is
        neither a valid path nor a DataFrame.
    """

    if not all(callable(func) for func in group_by_funcs):
        # BUG FIX: corrected "an returns" typo in the error message.
        raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input and returns valid categorical output.")

    # Check whether input_data is a valid file-system path or a dataframe.
    # BUG FIX: the original lambda ignored its argument and closed over
    # input_data; the check is now explicit.
    if isinstance(input_data, str) and os.path.exists(input_data):
        file_list = os.listdir(input_data)
        # BUG FIX: 'columns' must be list-like; the original passed the bare
        # string 'filename', which raises at runtime.
        df = pd.DataFrame(file_list, columns=['filename'])
    elif isinstance(input_data, pd.DataFrame):
        df = input_data.copy()
    else:
        raise ValueError("input_data must be either a valid file-system path or a dataframe.")

    # One grouping column per function, evaluated on the working frame.
    for i, func in enumerate(group_by_funcs):
        df['level_' + str(i) + '_groups'] = func(df)

    # 2. Validate group hierarchy, lower level groups must be embedded in higher level groups
    # 3. Create hdf5 file with groups defined by the 'file_group' column
    #    Add datasets to groups and the group's attributes

    return df
|
|
|
|
def get_filetype(df):
    """Return the extension (without the leading dot) of each df['filename'] entry."""
    extensions = []
    for fname in df['filename']:
        _, ext = os.path.splitext(fname)
        extensions.append(ext[1:])
    return extensions
|
|
|
|
def group_by_filenumber(df):
    """Return, for each df['filename'] entry, the substring before the first '_'."""
    prefixes = []
    for fname in df['filename']:
        # str.find returns -1 when '_' is absent, so such names lose
        # their final character — same behavior as the original.
        prefixes.append(fname[:fname.find('_')])
    return prefixes
|
|
|
|
def group_by_df_column(df, column_name: str):
    """Return the column of *df* named *column_name*, for use as a grouping key.

    df (pandas.DataFrame): frame holding the grouping column.
    column_name (str): column_name of df by which grouping operation will take place.

    Raises ValueError when the column does not exist.
    """
    known_columns = set(df.columns)
    if column_name not in known_columns:
        raise ValueError("column_name must be in the columns of df.")
    return df[column_name]
|
|
|
|
def main():
    """Load beam-time metadata, keep only .ibw files, and annotate them with groups."""

    # Input data frame: one row per measurement/file.
    input_data = read_hdf5_as_dataframe_v2('input_files\\BeamTimeMetaData.h5')

    # Rename column 'name' to 'filename': get_filetype derives filetypes from
    # the extensions of names assumed to live in the column 'filename'.
    input_data = input_data.rename(columns = {'name':'filename'})

    # Add column with filetypes to input_data.
    input_data['filetype'] = get_filetype(input_data)
    print(input_data['filetype'].unique())

    # Reduce input_data to files of ibw type.
    input_data = input_data.loc[input_data['filetype']=='ibw',:]

    # NOTE(review): assumes the metadata carries a 'sample' column — verify.
    group_by_sample = lambda x : group_by_df_column(x,'sample')
    df = create_hdf5_file(input_data,[group_by_filenumber,group_by_sample])

    # BUG FIX: removed the bare expression "df['file_group']" — that column is
    # never created (create_hdf5_file adds 'level_<i>_groups' columns), so the
    # lookup raised KeyError, and its result was discarded anyway.
    print(df.head())


if __name__ == '__main__':
    main()
|
|
|