Partially implemented validation of the group hierarchy input parameter. It still needs to be extended to deeper group hierarchies and should perhaps also report the nonconforming lower-level groups.
This commit is contained in:
45
hdf5_lib.py
45
hdf5_lib.py
@ -103,12 +103,33 @@ def read_hdf5_as_dataframe_v2(filename):
|
||||
|
||||
return output_dataframe
|
||||
|
||||
def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None):
|
||||
|
||||
"""
|
||||
group_by_funcs (list of callables): returns a pd.Series, whose unique values denote groups of input_data elements.
|
||||
def is_group_hierarchy_valid(df) -> bool:
    """Check that the group-label columns of ``df`` form a nested hierarchy.

    The columns are expected to be named ``level_0_groups``,
    ``level_1_groups``, ... (consecutive integers starting at 0). The
    hierarchy is valid when, for every pair of consecutive levels, all rows
    sharing a ``level_i_groups`` label also share one single
    ``level_{i+1}_groups`` label, i.e. no ``level_i`` group is split across
    several ``level_{i+1}`` groups.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame holding the group-label columns. It is not modified.

    Returns
    -------
    bool
        True if every consecutive pair of levels is nested (trivially True
        when fewer than two level columns are present), False otherwise.
    """
    # Collect the consecutive level columns that are actually present, so the
    # check works for one, two or more grouping levels.
    level_cols = []
    next_col = 'level_0_groups'
    while next_col in df.columns:
        level_cols.append(next_col)
        next_col = 'level_' + str(len(level_cols)) + '_groups'

    # Count distinct labels directly. Averaging integer-relabelled codes and
    # testing whether the mean is itself a code is not reliable: a group that
    # spans codes 0, 1 and 2 averages to 1 and would wrongly be accepted.
    for inner_col, outer_col in zip(level_cols, level_cols[1:]):
        labels_per_group = df.groupby(inner_col)[outer_col].nunique(dropna=False)
        if (labels_per_group > 1).any():
            return False

    return True
|
||||
|
||||
|
||||
|
||||
def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = None):
|
||||
|
||||
""" Creates an hdf5 file with at most three group levels, bottom, middle, and top level groups, where the top level group is the root '/' group.
|
||||
|
||||
input_data (pd.DataFrame | file-system path) :
|
||||
group_by_funcs (list of callables) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
|
||||
"""
|
||||
|
||||
|
||||
if not all([callable(func) for func in group_by_funcs]):
|
||||
raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input an returns valid categorical output.")
|
||||
# Check whether input_data is a valid file system path or a dataframe
|
||||
@ -123,9 +144,24 @@ def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None):
|
||||
else:
|
||||
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
|
||||
|
||||
list_of_group_cols = []
|
||||
for i, func in enumerate(group_by_funcs):
|
||||
list_of_group_cols.append('level_'+str(i)+'_groups')
|
||||
df['level_'+str(i)+'_groups'] = func(df)
|
||||
|
||||
# Check the length of group_by_funcs list is at most 2
|
||||
if len(group_by_funcs) > 2:
|
||||
# TODO: extend to more than 2 callable elements.
|
||||
raise ValueError("group_by_funcs can only contain at most two callable elements.")
|
||||
|
||||
if not is_group_hierarchy_valid(df.loc[:,list_of_group_cols]):
|
||||
raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
|
||||
|
||||
|
||||
|
||||
#for i, value in enumerate(df['level_'+str(0)+'_groups'].unique().tolist()):
|
||||
|
||||
|
||||
# 2. Validate group hierarchy, lower level groups must be embedded in higher level groups
|
||||
|
||||
# 3. Create hdf5 file with groups defined by the 'file_group' column
|
||||
@ -162,7 +198,8 @@ def main():
|
||||
input_data['filetype'] = get_filetype(input_data)
|
||||
print(input_data['filetype'].unique())
|
||||
# Reduce input_data to files of ibw type
|
||||
input_data = input_data.loc[input_data['filetype']=='ibw',:]
|
||||
input_data = input_data.loc[input_data['filetype']=='ibw', : ]
|
||||
input_data = input_data.loc[input_data['sample']!='' , : ]
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user