Implemented 1) a function to add metadata to the root folder of an existing HDF5 file, and 2) code to display the root folder's metadata in the treemap's hover text.

This commit is contained in:
2023-11-02 15:46:14 +01:00
parent 86be738216
commit 25c0f07cc3

View File

@@ -10,13 +10,13 @@ import plotly.express as px
import plotly.graph_objects as go import plotly.graph_objects as go
from plotly.subplots import make_subplots from plotly.subplots import make_subplots
def read_hdf5_as_dataframe(filename): def read_mtable_as_dataframe(filename):
""" Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input h5. file """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input h5. file
contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as
Datasets while categorical and numerical variables in the table are represented as attributes of each group. Datasets while categorical and numerical variables in the table are represented as attributes of each group.
Note:DataFrame is constructed columnwise to ensure homogenous data columns. Note: DataFrame is constructed columnwise to ensure homogenous data columns.
Parameters: Parameters:
@@ -69,11 +69,12 @@ def read_hdf5_as_dataframe(filename):
return output_dataframe return output_dataframe
def is_callable_list(x: list) -> bool:
    """Return True when every item of ``x`` is callable.

    An empty list yields True, since ``all`` of no items is True.

    Parameters:
        x (list): items to check.

    Returns:
        bool: True if ``callable(item)`` holds for every item, else False.
    """
    # A generator expression lets all() stop at the first non-callable item
    # instead of materializing an intermediate list first.
    return all(callable(item) for item in x)
def is_str_list(x: list) -> bool:
    """Return True when every item of ``x`` is a ``str`` instance.

    An empty list yields True, since ``all`` of no items is True.

    Parameters:
        x (list): items to check.

    Returns:
        bool: True if ``isinstance(item, str)`` holds for every item, else False.
    """
    # A generator expression lets all() stop at the first non-string item
    # instead of materializing an intermediate list first.
    return all(isinstance(item, str) for item in x)
def is_nested_hierarchy(df) -> bool: def is_nested_hierarchy(df) -> bool:
"""receives a dataframe with categorical columns and checks whether rows form a nested group hierarchy. """receives a dataframe with categorical columns and checks whether rows form a nested group hierarchy.
That is, from bottom to top, subsequent hierarchical levels contain nested groups. The lower level groups belong to exactly one group in the higher level group. That is, from bottom to top, subsequent hierarchical levels contain nested groups. The lower level groups belong to exactly one group in the higher level group.
@@ -103,8 +104,6 @@ def is_nested_hierarchy(df) -> bool:
return all([are_nested(df_tmp,'level_'+str(i)+'_groups','level_'+str(i+1)+'_groups') for i in range(len(df_tmp.columns)-1)]) return all([are_nested(df_tmp,'level_'+str(i)+'_groups','level_'+str(i+1)+'_groups') for i in range(len(df_tmp.columns)-1)])
def get_attr_names(input_data): def get_attr_names(input_data):
# TODO: extend this to file-system paths # TODO: extend this to file-system paths
@@ -115,11 +114,10 @@ def get_attr_names(input_data):
def create_group_hierarchy(obj, df, columns): def create_group_hierarchy(obj, df, columns):
""" """
Input: Input:
obj (h5py.File or h5py.Group) obj (h5py.File or h5py.Group)
columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy
""" """
if not columns: if not columns:
@@ -128,6 +126,9 @@ def create_group_hierarchy(obj, df, columns):
# Determine categories associated with first categorical column # Determine categories associated with first categorical column
unique_values = df[columns[0]].unique() unique_values = df[columns[0]].unique()
if obj.name == '/':
obj.attrs.create('count',df.shape[0])
for group_name in unique_values: for group_name in unique_values:
group = obj.require_group(group_name) group = obj.require_group(group_name)
@@ -136,15 +137,15 @@ def create_group_hierarchy(obj, df, columns):
sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:] sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:]
group.attrs.create('count',sub_df.shape[0]) group.attrs.create('count',sub_df.shape[0])
#if group_name == 'MgO powder,H2O,HCl': # if group_name == 'MgO powder,H2O,HCl':
# print('Here:',sub_df.shape) # print('Here:',sub_df.shape)
create_group_hierarchy(group, sub_df, columns[1::]) create_group_hierarchy(group, sub_df, columns[1::])
def get_parent_child_relationships(file: h5py.File): def get_parent_child_relationships(file: h5py.File):
nodes = [] nodes = ['/']
parent = [] parent = ['']
values = [] values = [file.attrs['count']]
def node_visitor(name,obj): def node_visitor(name,obj):
if isinstance(obj,h5py.Group): if isinstance(obj,h5py.Group):
@@ -184,12 +185,21 @@ def format_group_names(names: list):
def display_group_hierarchy_on_treemap(filename: str): def display_group_hierarchy_on_a_treemap(filename: str):
with h5py.File(filename,'r') as file: with h5py.File(filename,'r') as file:
nodes, parents, values = get_parent_child_relationships(file) nodes, parents, values = get_parent_child_relationships(file)
#formating_df = format_group_names(nodes + ["/"]) metadata_list = []
metadata_dict={}
for key in file.attrs.keys():
if 'metadata' in key:
metadata_dict[key[key.find('_')+1::]]= file.attrs[key]
metadata_list.append(key[key.find('_')+1::]+':'+file.attrs[key])
metadata = '<br>'.join(['<br>'] + metadata_list)
customdata_series = pd.Series(nodes)
customdata_series[0] = metadata
fig = make_subplots(1, 1, specs=[[{"type": "domain"}]],) fig = make_subplots(1, 1, specs=[[{"type": "domain"}]],)
fig.add_trace(go.Treemap( fig.add_trace(go.Treemap(
@@ -197,17 +207,25 @@ def display_group_hierarchy_on_treemap(filename: str):
parents=parents,#formating_df['formated_names'][parents], parents=parents,#formating_df['formated_names'][parents],
values=values, values=values,
branchvalues='total', branchvalues='total',
customdata= pd.Series(nodes), customdata= customdata_series,
#marker=dict( #marker=dict(
# colors=df_all_trees['color'], # colors=df_all_trees['color'],
# colorscale='RdBu', # colorscale='RdBu',
# cmid=average_score), # cmid=average_score),
#hovertemplate='<b>%{label} </b> <br> Number of files: %{value}<br> Success rate: %{color:.2f}', #hovertemplate='<b>%{label} </b> <br> Number of files: %{value}<br> Success rate: %{color:.2f}',
hovertemplate='<b>%{label} </b> <br> Count: %{value} <br> Path: %{customdata}', hovertemplate='<b>%{label} </b> <br> Count: %{value} <br> Path: %{customdata}',
name='' name='',
root_color="lightgrey"
)) ))
fig.update_layout(width = 800, height= 600, margin = dict(t=50, l=25, r=25, b=25)) fig.update_layout(width = 800, height= 600, margin = dict(t=50, l=25, r=25, b=25))
fig.show() fig.show()
def annotate_root_dir(filename, annotation_dict: dict):
    """Attach the entries of ``annotation_dict`` as attributes of the file's root.

    Each key is stored under the attribute name ``'metadata_' + key`` on
    ``file.attrs`` of the HDF5 file opened from ``filename``.

    Parameters:
        filename: path of the HDF5 file; it is opened in ``'r+'`` mode.
        annotation_dict (dict): maps annotation names to the values to store.
    """
    with h5py.File(filename, 'r+') as h5_file:
        root_attrs = h5_file.attrs
        for label, content in annotation_dict.items():
            # The 'metadata_' prefix marks the attribute as an annotation.
            root_attrs.create('metadata_' + label, content)
def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None): def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
@@ -279,9 +297,9 @@ def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : lis
# f.create_group(join_path(group_name,subgroup_name)) # f.create_group(join_path(group_name,subgroup_name))
# Get groups at the bottom of the hierarchy # Get groups at the bottom of the hierarchy
bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth']) #bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
nodes, parents, values = get_parent_child_relationships(file) #nodes, parents, values = get_parent_child_relationships(file)
print(':)') print(':)')
#fig = px.treemap(values=values,names=nodes, parents= parents) #fig = px.treemap(values=values,names=nodes, parents= parents)
#fig.update_traces(root_color="lightgrey") #fig.update_traces(root_color="lightgrey")
@@ -299,7 +317,7 @@ def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : lis
# #
# Add datasets to groups and the groups and the group's attributes # Add datasets to groups and the groups and the group's attributes
return 0 #return 0
def augment_with_filetype(df): def augment_with_filetype(df):
@@ -323,27 +341,13 @@ def group_by_df_column(df, column_name: str):
return df[column_name] return df[column_name]
def main(): def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame):
# input data frame
input_data = read_hdf5_as_dataframe('input_files\\BeamTimeMetaData.h5')
# Rename column 'name' with 'filename'. get_filetype finds filetypes based on extension of filenames assumed to be located at the column 'filename'.
input_data = input_data.rename(columns = {'name':'filename'})
# Add column with filetypes to input_data
input_data = augment_with_filenumber(input_data)
input_data = augment_with_filetype(input_data)
#input_data['filetype'] = get_filetype(input_data)
print(input_data['filetype'].unique())
# Reduce input_data to files of ibw type
input_data = input_data.loc[input_data['filetype']=='ibw', : ]
#input_data = input_data.loc[input_data['sample']!='' , : ]
sample_name = [] sample_name = []
sample_quality = [] sample_quality = []
for item in input_data['sample']: for item in input_data['sample']:
if item.find('(')!=-1: if item.find('(')!=-1:
print(item) #print(item)
sample_name.append(item[0:item.find('(')]) sample_name.append(item[0:item.find('(')])
sample_quality.append(item[item.find('(')+1:len(item)-1]) sample_quality.append(item[item.find('(')+1:len(item)-1])
else: else:
@@ -356,32 +360,48 @@ def main():
input_data['sample'] = sample_name input_data['sample'] = sample_name
input_data['data_quality'] = sample_quality input_data['data_quality'] = sample_quality
group_by_sample = lambda x : group_by_df_column(x,'sample') return input_data
group_by_type = lambda x : group_by_df_column(x,'filetype')
group_by_filenumber = lambda x : group_by_df_column(x,'filenumber')
def main():
    """Create ``test.h5`` from the beamtime metadata table, annotate it and plot it.

    The steps are:
      1. Read ``BeamTimeMetaData.h5`` from the ``input_files`` folder as a DataFrame.
      2. Preprocess the DataFrame (rename column, add file number / file type
         columns, split the sample column, convert the modification date).
      3. Call ``create_hdf5_file`` with the grouping functions to write ``test.h5``.
      4. Annotate the root of ``test.h5`` with campaign metadata.
      5. Display the group hierarchy of ``test.h5`` on a treemap.
    """
    import os  # function-scope import: only this entry point needs it

    # Build the path with os.path.join rather than a hard-coded backslash, so
    # the script finds the input file on non-Windows systems as well.
    input_filename = os.path.join('input_files', 'BeamTimeMetaData.h5')
    output_filename = 'test.h5'

    # Read BeamTimeMetaData.h5, containing Thorsten's Matlab Table
    input_data_df = read_mtable_as_dataframe(input_filename)

    # Preprocess Thorsten's input_data dataframe so that it can be used to create a newer .h5 file
    # under certain grouping specifications.
    input_data_df = input_data_df.rename(columns={'name': 'filename'})
    input_data_df = augment_with_filenumber(input_data_df)
    input_data_df = augment_with_filetype(input_data_df)
    input_data_df = split_sample_col_into_sample_and_data_quality_cols(input_data_df)
    # Converted to datetime so that min()/max() below yield the start and end dates.
    input_data_df['lastModifiedDatestr'] = input_data_df['lastModifiedDatestr'].astype('datetime64[s]')

    # Define grouping functions to be passed into create_hdf5_file function. These can also be set
    # as strings referring to categorical columns in input_data_df.
    test_grouping_funcs = True
    if test_grouping_funcs:
        group_by_sample = lambda x: group_by_df_column(x, 'sample')
        group_by_type = lambda x: group_by_df_column(x, 'filetype')
        group_by_filenumber = lambda x: group_by_df_column(x, 'filenumber')
    else:
        group_by_sample = 'sample'
        group_by_type = 'filetype'
        group_by_filenumber = 'filenumber'

    create_hdf5_file(output_filename, input_data_df, 'top-down',
                     group_by_funcs=[group_by_sample, group_by_type, group_by_filenumber])

    # Campaign-level annotations stored on the root of the output file.
    annotation_dict = {
        'Campaign name': 'SLS-Campaign-2023',
        'Users': 'Thorsten, Luca, Zoe',
        'Startdate': str(input_data_df['lastModifiedDatestr'].min()),
        'Enddate': str(input_data_df['lastModifiedDatestr'].max()),
    }
    annotate_root_dir(output_filename, annotation_dict)

    display_group_hierarchy_on_a_treemap(output_filename)

    print(':)')
#success = create_hdf5_file('test_v2.h5',input_data, 'top-down', group_by_funcs = ['sample','filenumber','filetype'])
#df['file_group']
#print(df.head())
if __name__ == '__main__': if __name__ == '__main__':
main() main()