Modified create_hdf5_file to accept a group_by_funcs list of functions that will be used to create the group hierarchy.
This commit is contained in:
83
hdf5_lib.py
83
hdf5_lib.py
@ -1,6 +1,6 @@
|
||||
import pandas as pd
|
||||
import h5py
|
||||
#import os
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
@ -103,3 +103,84 @@ def read_hdf5_as_dataframe_v2(filename):
|
||||
|
||||
return output_dataframe
|
||||
|
||||
def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None):
    """Build the grouping dataframe that defines an HDF5 group hierarchy.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        Either a path to an existing directory (its file listing becomes
        the 'filename' column) or a dataframe that is copied as-is.
    group_by_funcs : list of callable
        Each callable takes the dataframe and returns a pd.Series (or
        sequence) whose unique values denote one level of groups; the
        result of the i-th callable is stored in column 'level_i_groups'.
    extract_attrs_func : callable, optional
        Reserved for attribute extraction; currently unused.

    Returns
    -------
    pandas.DataFrame
        Copy of the input data with one 'level_i_groups' column added per
        grouping function.

    Raises
    ------
    ValueError
        If group_by_funcs contains a non-callable, or if input_data is
        neither an existing file-system path nor a dataframe.
    """
    if not all(callable(func) for func in group_by_funcs):
        raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input and returns valid categorical output.")

    # Check whether input_data is a valid file-system path or a dataframe.
    # BUG FIX: the original lambda ignored its parameter and closed over
    # input_data; this helper actually tests its argument.
    def _is_existing_path(x):
        return isinstance(x, str) and os.path.exists(x)

    if _is_existing_path(input_data):
        file_list = os.listdir(input_data)
        # BUG FIX: 'columns' must be list-like; the bare string 'filename'
        # made pandas raise ValueError on this branch.
        df = pd.DataFrame(file_list, columns=['filename'])
    elif isinstance(input_data, pd.DataFrame):
        df = input_data.copy()
    else:
        raise ValueError("input_data must be either a valid file-system path or a dataframe.")

    # One grouping column per level of the requested hierarchy.
    for i, func in enumerate(group_by_funcs):
        df['level_' + str(i) + '_groups'] = func(df)

    # TODO (not yet implemented in this version):
    # 2. Validate the group hierarchy: lower-level groups must be embedded
    #    in higher-level groups.
    # 3. Create the HDF5 file with groups defined by the grouping columns,
    #    then add datasets to the groups and set the groups' attributes.
    return df
|
||||
|
||||
def get_filetype(df):
    """Return the extension (without the leading dot) of each entry in df['filename']."""
    extensions = []
    for name in df['filename']:
        _, ext = os.path.splitext(name)
        extensions.append(ext[1:])
    return extensions
|
||||
|
||||
def group_by_filenumber(df):
    """Return, for each filename, the prefix before the first underscore.

    NOTE(review): when a filename contains no underscore, str.find returns
    -1 and the slice drops the last character instead of keeping the whole
    name — preserved here; confirm this is intended.
    """
    prefixes = []
    for name in df['filename']:
        prefixes.append(name[:name.find('_')])
    return prefixes
|
||||
|
||||
def group_by_df_column(df, column_name: str):
    """Return the column of df by which the grouping operation will take place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the grouping column.
    column_name : str
        Name of the df column to group by.

    Returns
    -------
    pandas.Series
        The requested column.

    Raises
    ------
    ValueError
        If column_name is not a column of df.
    """
    # Idiom fix: 'not in' membership test instead of 'not x in y'.
    if column_name not in df.columns:
        raise ValueError("column_name must be in the columns of df.")

    return df[column_name]
|
||||
|
||||
def main():
    """Load beamtime metadata, keep only .ibw files, and build grouping columns."""
    # Input dataframe (read_hdf5_as_dataframe_v2 is defined earlier in this file).
    input_data = read_hdf5_as_dataframe_v2('input_files\\BeamTimeMetaData.h5')

    # Rename column 'name' to 'filename': get_filetype derives filetypes from
    # extensions of filenames assumed to be located in the 'filename' column.
    input_data = input_data.rename(columns = {'name':'filename'})
    # Add column with filetypes to input_data.
    input_data['filetype'] = get_filetype(input_data)
    print(input_data['filetype'].unique())
    # Reduce input_data to files of ibw type.
    input_data = input_data.loc[input_data['filetype']=='ibw',:]

    group_by_sample = lambda x : group_by_df_column(x,'sample')
    df = create_hdf5_file(input_data,[group_by_filenumber,group_by_sample])

    # BUG FIX: removed the bare expression df['file_group'] — it raised
    # KeyError because create_hdf5_file produces 'level_i_groups' columns,
    # never a 'file_group' column.
    print(df.head())


if __name__ == '__main__':
    main()
|
||||
|
||||
|
Reference in New Issue
Block a user