Updated function create_hdf5_file_from_filesystem_path(ofilename, input_file_system_path), which can now integrate XPS .ibw files as datasets in the resulting HDF5 file.

2024-02-01 15:29:03 +01:00
parent 118191d93a
commit be43367bc0

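For orientation, a minimal usage sketch of the updated function. The module name hdf5_lib, the input directory, and the output filename are illustrative assumptions, not part of the commit:

import h5py
# Assumption: the module shown in the diff below is importable as hdf5_lib.
from hdf5_lib import create_hdf5_file_from_filesystem_path

# Hypothetical directory tree whose subfolders contain XPS .ibw files.
input_dir = 'input_files/TypicalBeamTime'

# Mirror the tree as HDF5 groups; each .ibw file becomes a dataset
# carrying the wave's metadata as HDF5 attributes.
create_hdf5_file_from_filesystem_path('test2.h5', input_dir)

with h5py.File('test2.h5', 'r') as h5file:
    h5file.visit(print)  # print the resulting group/dataset hierarchy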

@@ -9,12 +9,12 @@ import matplotlib.pyplot as plt
 import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import igor2
 from igor2.binarywave import load as loadibw
 import g5505_file_reader

 def read_mtable_as_dataframe(filename):
-    """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input h5. file
+    """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input .h5 file
     contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as
     Datasets while categorical and numerical variables in the table are represented as attributes of each group.
@@ -70,6 +70,35 @@ def read_mtable_as_dataframe(filename):
         output_dataframe.loc[:,column_label] = tmp_col

     return output_dataframe

+def create_group_hierarchy(obj, df, columns):
+    """
+    Input:
+    obj (h5py.File or h5py.Group)
+    columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy
+    """
+    if not columns:
+        return
+    # Determine categories associated with first categorical column
+    unique_values = df[columns[0]].unique()
+
+    if obj.name == '/':
+        obj.attrs.create('count',df.shape[0])
+
+    for group_name in unique_values:
+        group = obj.require_group(group_name)
+        group.attrs.create('column_name', columns[0])
+        sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:]
+        group.attrs.create('count',sub_df.shape[0])
+        # if group_name == 'MgO powder,H2O,HCl':
+        #     print('Here:',sub_df.shape)
+        create_group_hierarchy(group, sub_df, columns[1::])

 def is_callable_list(x : list):
     return all([callable(item) for item in x])
@@ -113,51 +142,27 @@ def get_attr_names(input_data):
         raise ValueError("input_data must be a pd.DataFrame")

     return input_data.columns

-def create_group_hierarchy(obj, df, columns):
-    """
-    Input:
-    obj (h5py.File or h5py.Group)
-    columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy
-    """
-    if not columns:
-        return
-    # Determine categories associated with first categorical column
-    unique_values = df[columns[0]].unique()
-
-    if obj.name == '/':
-        obj.attrs.create('count',df.shape[0])
-
-    for group_name in unique_values:
-        group = obj.require_group(group_name)
-        group.attrs.create('column_name', columns[0])
-        sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:]
-        group.attrs.create('count',sub_df.shape[0])
-        # if group_name == 'MgO powder,H2O,HCl':
-        #     print('Here:',sub_df.shape)
-        create_group_hierarchy(group, sub_df, columns[1::])

 def get_parent_child_relationships(file: h5py.File):

     nodes = ['/']
     parent = ['']
-    values = [file.attrs['count']]
+    #values = [file.attrs['count']]
+    values = [len(file.attrs['file_list'])]

     def node_visitor(name,obj):
-        if isinstance(obj,h5py.Group):
+        #if isinstance(obj,h5py.Group):
         nodes.append(obj.name)
         parent.append(obj.parent.name)
         #nodes.append(os.path.split(obj.name)[1])
         #parent.append(os.path.split(obj.parent.name)[1])
-        values.append(obj.attrs['count'])
+        if isinstance(obj,h5py.Dataset):
+            values.append(1)
+        else:
+            values.append(len(obj.attrs['file_list']))

     file.visititems(node_visitor)

     return nodes, parent, values
@@ -207,8 +212,8 @@ def display_group_hierarchy_on_a_treemap(filename: str):
     fig.add_trace(go.Treemap(
         labels=nodes, #formating_df['formated_names'][nodes],
         parents=parents,#formating_df['formated_names'][parents],
-        #values=values,
-        branchvalues='total',
+        values=values,
+        branchvalues='remainder',
         customdata= customdata_series,
         #marker=dict(
         #    colors=df_all_trees['color'],
@@ -231,11 +236,13 @@ def annotate_root_dir(filename,annotation_dict: dict):
 def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path):

-    with h5py.File(ofilename, 'w') as file:
+    with h5py.File(ofilename, 'w') as h5file:

         root_dir = '?##'

-        for dirpath, dirnames, filenames in os.walk(input_file_system_path,topdown=True):
+        # Loops over (i.e., visits) each subdirectory from the root directory defined by
+        # input_file_system_path down to the lowest-level subfolders.
+        for dirpath, dirnames, filenames_list in os.walk(input_file_system_path,topdown=True):

             group_name = dirpath.replace(os.sep,'/')
@@ -243,16 +250,30 @@ def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path):
                 # Set root_dir to top directory path in input file system
                 root_dir = group_name
                 group_name = group_name.replace(root_dir,'/')

-                file.create_dataset(name='file_list',data=filenames)
-                file.attrs.create(name='count',data=len(filenames))
+                #h5file.attrs.create(name='count',data=len(filenames_list))
+                h5file.attrs.create(name='file_list',data=filenames_list)
             else:
                 group_name = group_name.replace(root_dir+'/','/')
                 # Group hierarchy is implicitly defined by the forward slashes
-                file.create_group(group_name)
-                file[group_name].create_dataset(name='file_list',data=filenames)
-                file[group_name].attrs.create(name='count',data=len(filenames))
+                h5file.create_group(group_name)
+                h5file[group_name].attrs.create(name='file_list',data=filenames_list)

-                file.attrs['count'] = file.attrs['count'] + file[group_name].attrs['count']

-            # TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)
+            for filename in filenames_list:
+                if 'ibw' in filename:
+                    file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename))
+                    h5file[group_name].create_dataset(name = file_dict['name'],
+                                                      data = file_dict['data'],
+                                                      #dtype = file_dict['dtype'],
+                                                      shape = file_dict['shape'])
+                    #h5file[group_name][file_dict['name']].dims[0] = file_dict['dimension_units']
+                    for key in file_dict['attributes_dict'].keys():
+                        h5file[group_name][file_dict['name']].attrs.create(name=key,data=file_dict['attributes_dict'][key])

 def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
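Stripped of the os.walk bookkeeping, the per-file step added above boils down to the helper sketched here; it reuses only calls and dictionary keys that appear in the diff (the helper name itself is hypothetical):

import os
import h5py
import g5505_file_reader

def add_ibw_as_dataset(h5group: h5py.Group, dirpath: str, filename: str):
    # Parse the Igor binary wave into a plain dict (name, data, shape, attributes_dict).
    file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath, filename))
    dset = h5group.create_dataset(name=file_dict['name'],
                                  data=file_dict['data'],
                                  shape=file_dict['shape'])
    # Copy the wave's metadata onto the dataset as HDF5 attributes.
    for key, value in file_dict['attributes_dict'].items():
        dset.attrs.create(name=key, data=value)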
@@ -398,14 +419,22 @@ def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame)
     return input_data

-def main():
-    inputfile_dir = 'Z:\\People\\Juan\\TypicalBeamTime'
+def main1():
+    inputfile_dir = '\\\\fs101\\5505\\People\\Juan\\TypicalBeamTime'
+
+    #ibw_file = loadibw(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw')
+    file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw')

     group_by_type = lambda x : group_by_df_column(x,'filetype')

-    create_hdf5_file_from_filesystem_path('test2.h5', inputfile_dir)
+    #create_hdf5_file_from_filesystem_path('test2.h5',inputfile_dir)

     display_group_hierarchy_on_a_treemap('test2.h5')

     #create_hdf5_file('test', inputfile_dir, 'Topdown', [group_by_type], extract_attrs_func = None)

+def main2():
+    # Read BeamTimeMetaData.h5, containing Thorsten's Matlab Table
+    input_data_df = read_mtable_as_dataframe('input_files\\BeamTimeMetaData.h5')
@@ -446,5 +475,7 @@ def main():
 if __name__ == '__main__':

-    main()
+    main1()
+
+    print(':)')
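The internals of g5505_file_reader.read_xps_ibw_file_as_dict are not shown in this commit. Given the igor2 imports visible at the top of the file, a hedged sketch of what such a reader might return; the wave-header fields consulted here are an assumption:

from igor2.binarywave import load as loadibw

def read_ibw_as_dict_sketch(path):
    # igor2 returns a nested dict; the wave payload sits under 'wave'.
    ibw = loadibw(path)
    wave = ibw['wave']
    data = wave['wData']                    # numeric array of the wave
    name = wave['wave_header']['bname']     # wave name, stored as bytes
    if isinstance(name, bytes):
        name = name.decode()
    return {'name': name,
            'data': data,
            'shape': data.shape,
            'attributes_dict': {'note': wave.get('note', b'')}}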