Updated function create_hdf5_file_from_filesystem_path(ofilename, input_file_system_path), which can now integrate XPS .ibw files as datasets in the resulting HDF5 file.

This commit is contained in:
2024-02-01 15:29:03 +01:00
parent 118191d93a
commit be43367bc0

View File

@@ -9,12 +9,12 @@ import matplotlib.pyplot as plt
import plotly.express as px import plotly.express as px
import plotly.graph_objects as go import plotly.graph_objects as go
from plotly.subplots import make_subplots from plotly.subplots import make_subplots
import igor2
from igor2.binarywave import load as loadibw import g5505_file_reader
def read_mtable_as_dataframe(filename): def read_mtable_as_dataframe(filename):
""" Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input h5. file """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input .h5 file
contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as
Datasets while categorical and numerical variables in the table are represented as attributes of each group. Datasets while categorical and numerical variables in the table are represented as attributes of each group.
@@ -70,6 +70,35 @@ def read_mtable_as_dataframe(filename):
output_dataframe.loc[:,column_label] = tmp_col output_dataframe.loc[:,column_label] = tmp_col
return output_dataframe return output_dataframe
def create_group_hierarchy(obj, df, columns):
    """Recursively build an HDF5 group hierarchy from categorical columns of a dataframe.

    Input:
        obj (h5py.File or h5py.Group): node under which one subgroup is created
            per unique value of the first column in `columns`.
        df (pd.DataFrame): rows to be partitioned among the subgroups.
        columns (list of strs): denote categorical columns in df to be used to
            define the hdf5 file group hierarchy, one hierarchy level per column.

    Side effects: each created subgroup gets a 'column_name' attribute (the
    originating column) and a 'count' attribute (number of matching rows).
    The root node ('/') additionally gets a 'count' attribute with the total
    number of rows. Returns None.
    """
    # Base case: no categorical levels left to expand.
    if not columns:
        return
    # Determine categories associated with first categorical column
    unique_values = df[columns[0]].unique()
    # Only the root records the overall row count.
    if obj.name == '/':
        obj.attrs.create('count', df.shape[0])
    for group_name in unique_values:
        group = obj.require_group(group_name)
        group.attrs.create('column_name', columns[0])
        # Rows belonging to this category; same as df.loc[df[columns[0]] == group_name, :]
        sub_df = df[df[columns[0]] == group_name]
        group.attrs.create('count', sub_df.shape[0])
        # Recurse with the remaining categorical columns on the matching rows.
        create_group_hierarchy(group, sub_df, columns[1:])
def is_callable_list(x : list): def is_callable_list(x : list):
return all([callable(item) for item in x]) return all([callable(item) for item in x])
@@ -113,51 +142,27 @@ def get_attr_names(input_data):
raise ValueError("input_data must be a pd.DataFrame") raise ValueError("input_data must be a pd.DataFrame")
return input_data.columns return input_data.columns
def create_group_hierarchy(obj, df, columns):
    """Build nested HDF5 groups mirroring the category values found in `columns`.

    Input:
        obj (h5py.File or h5py.Group): node to populate with subgroups
        df (pd.DataFrame): rows distributed among the subgroups
        columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy
    """
    # Nothing to do once every categorical level has been consumed.
    if not columns:
        return
    current_col = columns[0]
    remaining_cols = columns[1::]
    # Categories of the current column define this level's subgroups.
    categories = df[current_col].unique()
    if obj.name == '/':
        obj.attrs.create('count', df.shape[0])
    for category in categories:
        subgroup = obj.require_group(category)
        subgroup.attrs.create('column_name', current_col)
        matching_rows = df[df[current_col] == category]  # same as df.loc[df[current_col]==category,:]
        subgroup.attrs.create('count', matching_rows.shape[0])
        # if category == 'MgO powder,H2O,HCl':
        #    print('Here:', matching_rows.shape)
        create_group_hierarchy(subgroup, matching_rows, remaining_cols)
def get_parent_child_relationships(file: h5py.File): def get_parent_child_relationships(file: h5py.File):
nodes = ['/'] nodes = ['/']
parent = [''] parent = ['']
values = [file.attrs['count']] #values = [file.attrs['count']]
values = [len(file.attrs['file_list'])]
def node_visitor(name,obj): def node_visitor(name,obj):
if isinstance(obj,h5py.Group): #if isinstance(obj,h5py.Group):
nodes.append(obj.name) nodes.append(obj.name)
parent.append(obj.parent.name) parent.append(obj.parent.name)
#nodes.append(os.path.split(obj.name)[1]) #nodes.append(os.path.split(obj.name)[1])
#parent.append(os.path.split(obj.parent.name)[1]) #parent.append(os.path.split(obj.parent.name)[1])
values.append(obj.attrs['count']) if isinstance(obj,h5py.Dataset):
values.append(1)
else:
values.append(len(obj.attrs['file_list']))
file.visititems(node_visitor) file.visititems(node_visitor)
return nodes, parent, values return nodes, parent, values
@@ -207,8 +212,8 @@ def display_group_hierarchy_on_a_treemap(filename: str):
fig.add_trace(go.Treemap( fig.add_trace(go.Treemap(
labels=nodes, #formating_df['formated_names'][nodes], labels=nodes, #formating_df['formated_names'][nodes],
parents=parents,#formating_df['formated_names'][parents], parents=parents,#formating_df['formated_names'][parents],
#values=values, values=values,
branchvalues='total', branchvalues='remainder',
customdata= customdata_series, customdata= customdata_series,
#marker=dict( #marker=dict(
# colors=df_all_trees['color'], # colors=df_all_trees['color'],
@@ -231,11 +236,13 @@ def annotate_root_dir(filename,annotation_dict: dict):
def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path): def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path):
with h5py.File(ofilename, 'w') as file: with h5py.File(ofilename, 'w') as h5file:
root_dir = '?##' root_dir = '?##'
for dirpath, dirnames, filenames in os.walk(input_file_system_path,topdown=True): # loops over (or visits each) subdirectories from root directory defined by input_file_sytem_path to the lower
#level subfolders
for dirpath, dirnames, filenames_list in os.walk(input_file_system_path,topdown=True):
group_name = dirpath.replace(os.sep,'/') group_name = dirpath.replace(os.sep,'/')
@@ -243,16 +250,30 @@ def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path):
# Set root_dir to top directory path in input file system # Set root_dir to top directory path in input file system
root_dir = group_name root_dir = group_name
group_name = group_name.replace(root_dir,'/') group_name = group_name.replace(root_dir,'/')
file.create_dataset(name='file_list',data=filenames) #h5file.attrs.create(name='count',data=len(filenames_list))
file.attrs.create(name='count',data=len(filenames)) h5file.attrs.create(name='file_list',data=filenames_list)
else: else:
group_name = group_name.replace(root_dir+'/','/') group_name = group_name.replace(root_dir+'/','/')
# Group hierarchy is implicitly defined by the forward slashes # Group hierarchy is implicitly defined by the forward slashes
file.create_group(group_name) h5file.create_group(group_name)
file[group_name].create_dataset(name='file_list',data=filenames) h5file[group_name].attrs.create(name='file_list',data=filenames_list)
file[group_name].attrs.create(name='count',data=len(filenames))
file.attrs['count'] = file.attrs['count'] + file[group_name].attrs['count'] # TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)
for filename in filenames_list:
if 'ibw' in filename:
file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename))
h5file[group_name].create_dataset(name = file_dict['name'],
data = file_dict['data'],
#dtype = file_dict['dtype'],
shape = file_dict['shape'])
#h5file[group_name][file_dict['name']].dims[0] = file_dict['dimension_units']
for key in file_dict['attributes_dict'].keys():
h5file[group_name][file_dict['name']].attrs.create(name=key,data=file_dict['attributes_dict'][key])
def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None): def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
@@ -398,14 +419,22 @@ def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame)
return input_data return input_data
def main():
inputfile_dir = 'Z:\\People\\Juan\\TypicalBeamTime'
def main1():
inputfile_dir = '\\\\fs101\\5505\\People\\Juan\\TypicalBeamTime'
#ibw_file = loadibw(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw')
file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw')
group_by_type = lambda x : group_by_df_column(x,'filetype') group_by_type = lambda x : group_by_df_column(x,'filetype')
create_hdf5_file_from_filesystem_path('test2.h5', inputfile_dir) #create_hdf5_file_from_filesystem_path('test2.h5',inputfile_dir)
display_group_hierarchy_on_a_treemap('test2.h5') display_group_hierarchy_on_a_treemap('test2.h5')
#create_hdf5_file('test', inputfile_dir, 'Topdown', [group_by_type], extract_attrs_func = None) #create_hdf5_file('test', inputfile_dir, 'Topdown', [group_by_type], extract_attrs_func = None)
def main2():
# Read BeamTimeMetaData.h5, containing Thorsten's Matlab Table # Read BeamTimeMetaData.h5, containing Thorsten's Matlab Table
input_data_df = read_mtable_as_dataframe('input_files\\BeamTimeMetaData.h5') input_data_df = read_mtable_as_dataframe('input_files\\BeamTimeMetaData.h5')
@@ -446,5 +475,7 @@ def main():
if __name__ == '__main__': if __name__ == '__main__':
main() main1()
print(':)')