diff --git a/hdf5_lib.py b/hdf5_lib.py
index e9bb442..3780814 100644
--- a/hdf5_lib.py
+++ b/hdf5_lib.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import h5py
-#import os
+import os
 import sys
 import numpy as np
 
@@ -103,3 +103,82 @@ def read_hdf5_as_dataframe_v2(filename):
 
     return output_dataframe
 
+def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func=None):
+
+    """
+    Assign hierarchical group labels to a collection of files or dataframe rows.
+
+    input_data (str or pandas.DataFrame): path to a directory of files, or a dataframe with a 'filename' column.
+    group_by_funcs (list of callables): each takes the dataframe and returns a pd.Series whose unique values denote groups of input_data elements.
+    extract_attrs_func (callable, optional): reserved for extracting group attributes; not used yet.
+    """
+
+    if not all(callable(func) for func in group_by_funcs):
+        raise ValueError("'group_by_funcs' must be a list of callables that take the dataframe as input and return valid categorical output.")
+
+    # 1. Check whether input_data is a valid file-system path or a dataframe
+    check_possible_path = lambda x: os.path.exists(x) if isinstance(x, str) else False
+
+    if check_possible_path(input_data):
+        file_list = os.listdir(input_data)
+        df = pd.DataFrame(file_list, columns=['filename'])
+    elif isinstance(input_data, pd.DataFrame):
+        df = input_data.copy()
+    else:
+        raise ValueError("input_data must be either a valid file-system path or a dataframe.")
+
+    # Add one group-label column per grouping function
+    for i, func in enumerate(group_by_funcs):
+        df['level_' + str(i) + '_groups'] = func(df)
+
+    # 2. TODO: validate the group hierarchy; lower-level groups must be nested inside higher-level groups
+
+    # 3. TODO: create the HDF5 file with groups defined by the 'level_*_groups' columns,
+    #    then add datasets to the groups and set the group attributes
+
+    return df
+
+def get_filetype(df):
+    # File extension (without the leading dot) of each entry in df['filename']
+    return [os.path.splitext(item)[1][1:] for item in df['filename']]
+
+def group_by_filenumber(df):
+    # Group files by the prefix before the first underscore in 'filename'
+    return [item[0:item.find('_')] for item in df['filename']]
+
+def group_by_df_column(df, column_name: str):
+    """
+    df (pandas.DataFrame): dataframe holding the column to group by.
+    column_name (str): column of df by which the grouping operation takes place.
+    """
+
+    if column_name not in df.columns:
+        raise ValueError("column_name must be one of the columns of df.")
+
+    return df[column_name]
+
+def main():
+
+    # Input dataframe
+    input_data = read_hdf5_as_dataframe_v2('input_files\\BeamTimeMetaData.h5')
+
+    # Rename column 'name' to 'filename'; get_filetype derives file types from the
+    # extensions of the filenames expected in the 'filename' column.
+    input_data = input_data.rename(columns={'name': 'filename'})
+    # Add a column with file types to input_data
+    input_data['filetype'] = get_filetype(input_data)
+    print(input_data['filetype'].unique())
+    # Reduce input_data to files of ibw type
+    input_data = input_data.loc[input_data['filetype'] == 'ibw', :]
+
+    # Group first by file number, then by the 'sample' column
+    group_by_sample = lambda x: group_by_df_column(x, 'sample')
+    df = create_hdf5_file(input_data, [group_by_filenumber, group_by_sample])
+
+    print(df.head())
+
+
+if __name__ == '__main__':
+
+    main()
+
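The patch leaves step 3 of create_hdf5_file (writing the grouped layout to an HDF5 file) as a TODO. Below is a minimal sketch of how that step could look with h5py, assuming the 'level_0_groups' and 'level_1_groups' columns produced above; the helper name write_groups_to_hdf5 and the output path 'metadata.h5' are illustrative only and not part of this patch.

    import h5py
    import pandas as pd

    def write_groups_to_hdf5(df: pd.DataFrame, output_path: str = 'metadata.h5') -> None:
        # Sketch only: one HDF5 group per level-0 value, with a subgroup per level-1 value,
        # storing each subgroup's member filenames as a variable-length string dataset.
        with h5py.File(output_path, 'w') as h5_file:
            for (lvl0, lvl1), members in df.groupby(['level_0_groups', 'level_1_groups']):
                group = h5_file.require_group(str(lvl0)).require_group(str(lvl1))
                group.attrs['file_count'] = len(members)
                group.create_dataset('filenames',
                                     data=list(members['filename']),
                                     dtype=h5py.string_dtype())

Calling write_groups_to_hdf5(df) on the dataframe returned by create_hdf5_file would mirror the two-level hierarchy (file number, then sample) that main() sets up.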