Partially implemented validation of the group-hierarchy input parameter. It still needs to be extended to deeper group hierarchies, and should perhaps also report the nonconforming lower-level groups.

This commit is contained in:
2023-10-20 16:56:27 +02:00
parent 7eb1686d1f
commit d95a08caaa

View File

@ -103,12 +103,33 @@ def read_hdf5_as_dataframe_v2(filename):
return output_dataframe
def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None):
"""
group_by_funcs (list of callables): returns a pd.Series, whose unique values denote groups of input_data elements.
def is_group_hierarchy_valid(df) -> bool:
    """ Checks that the group-label columns of df define a nested group hierarchy.

    df (pd.DataFrame) : must contain a 'level_0_groups' column and may contain
        'level_1_groups', 'level_2_groups', ... columns. Only consecutively
        numbered levels starting at 0 are inspected; other columns are ignored.

    Returns True when, for every pair of consecutive levels, all rows that share
    a 'level_<i>_groups' label also share one single 'level_<i+1>_groups' label.
    A frame with a single level (or with no rows) is trivially valid.

    Raises KeyError when df has no 'level_0_groups' column.
    """
    if 'level_0_groups' not in df.columns:
        raise KeyError('level_0_groups')

    # Collect level_0_groups, level_1_groups, ... for as long as they exist.
    level_columns = []
    while f'level_{len(level_columns)}_groups' in df.columns:
        level_columns.append(f'level_{len(level_columns)}_groups')

    for inner_col, outer_col in zip(level_columns, level_columns[1:]):
        # One row per distinct (inner label, outer label) combination.
        label_pairs = df[[inner_col, outer_col]].drop_duplicates()
        # An inner label appearing in two rows is paired with two different
        # outer labels, i.e. the inner group straddles several outer groups.
        # Comparing labels directly avoids the false positives of averaging
        # integer codes (codes 0 and 2 average to the valid-looking code 1).
        if label_pairs[inner_col].duplicated().any():
            return False
    return True
def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = None):
""" Creates an hdf5 file with at most three group levels, bottom, middle, and top level groups, where the top level group is the root '/' group.
input_data (pd.DataFrame | file-system path) :
group_by_funcs (list of callables) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
"""
if not all([callable(func) for func in group_by_funcs]):
raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input an returns valid categorical output.")
# Check whether input_data is a valid file system path or a dataframe
@ -123,9 +144,24 @@ def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None):
else:
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
list_of_group_cols = []
for i, func in enumerate(group_by_funcs):
list_of_group_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df)
# Check the length of group_by_funcs list is at most 2
if len(group_by_funcs) > 2:
# TODO: extend to more than 2 callable elements.
raise ValueError("group_by_funcs can only contain at most two callable elements.")
if not is_group_hierarchy_valid(df.loc[:,list_of_group_cols]):
raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
#for i, value in enumerate(df['level_'+str(0)+'_groups'].unique().tolist()):
# 2. Validate group hierarchy, lower level groups must be embedded in higher level groups
# 3. Create hdf5 file with groups defined by the 'file_group' column
@ -162,7 +198,8 @@ def main():
input_data['filetype'] = get_filetype(input_data)
print(input_data['filetype'].unique())
# Reduce input_data to files of ibw type
input_data = input_data.loc[input_data['filetype']=='ibw',:]
input_data = input_data.loc[input_data['filetype']=='ibw', : ]
input_data = input_data.loc[input_data['sample']!='' , : ]