diff --git a/hdf5_lib.py b/hdf5_lib.py index 08529ba..0861802 100644 --- a/hdf5_lib.py +++ b/hdf5_lib.py @@ -9,12 +9,12 @@ import matplotlib.pyplot as plt import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots -import igor2 -from igor2.binarywave import load as loadibw + +import g5505_file_reader def read_mtable_as_dataframe(filename): - """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input h5. file + """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input .h5 file contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as Datasets while categorical and numerical variables in the table are represented as attributes of each group. @@ -70,6 +70,35 @@ def read_mtable_as_dataframe(filename): output_dataframe.loc[:,column_label] = tmp_col return output_dataframe + +def create_group_hierarchy(obj, df, columns): + + """ + Input: + obj (h5py.File or h5py.Group) + columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy + """ + + if not columns: + return + + # Determine categories associated with first categorical column + unique_values = df[columns[0]].unique() + + if obj.name == '/': + obj.attrs.create('count',df.shape[0]) + + for group_name in unique_values: + + group = obj.require_group(group_name) + group.attrs.create('column_name', columns[0]) + + sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:] + group.attrs.create('count',sub_df.shape[0]) + + # if group_name == 'MgO powder,H2O,HCl': + # print('Here:',sub_df.shape) + create_group_hierarchy(group, sub_df, columns[1::]) def is_callable_list(x : list): return all([callable(item) for item in x]) @@ -113,51 +142,27 @@ def get_attr_names(input_data): raise ValueError("input_data must be a pd.DataFrame") return input_data.columns - -def create_group_hierarchy(obj, df, columns): 
- - """ - Input: - obj (h5py.File or h5py.Group) - columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy - """ - - if not columns: - return - - # Determine categories associated with first categorical column - unique_values = df[columns[0]].unique() - - if obj.name == '/': - obj.attrs.create('count',df.shape[0]) - - for group_name in unique_values: - - group = obj.require_group(group_name) - group.attrs.create('column_name', columns[0]) - - sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:] - group.attrs.create('count',sub_df.shape[0]) - - # if group_name == 'MgO powder,H2O,HCl': - # print('Here:',sub_df.shape) - create_group_hierarchy(group, sub_df, columns[1::]) def get_parent_child_relationships(file: h5py.File): nodes = ['/'] parent = [''] - values = [file.attrs['count']] + #values = [file.attrs['count']] + values = [len(file.attrs['file_list'])] def node_visitor(name,obj): - if isinstance(obj,h5py.Group): + #if isinstance(obj,h5py.Group): nodes.append(obj.name) parent.append(obj.parent.name) #nodes.append(os.path.split(obj.name)[1]) #parent.append(os.path.split(obj.parent.name)[1]) - values.append(obj.attrs['count']) + if isinstance(obj,h5py.Dataset): + values.append(1) + else: + values.append(len(obj.attrs['file_list'])) file.visititems(node_visitor) + return nodes, parent, values @@ -207,8 +212,8 @@ def display_group_hierarchy_on_a_treemap(filename: str): fig.add_trace(go.Treemap( labels=nodes, #formating_df['formated_names'][nodes], parents=parents,#formating_df['formated_names'][parents], - #values=values, - branchvalues='total', + values=values, + branchvalues='remainder', customdata= customdata_series, #marker=dict( # colors=df_all_trees['color'], @@ -231,11 +236,13 @@ def annotate_root_dir(filename,annotation_dict: dict): def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path): - with h5py.File(ofilename, 'w') as file: + with h5py.File(ofilename, 
'w') as h5file: root_dir = '?##' - for dirpath, dirnames, filenames in os.walk(input_file_system_path,topdown=True): + # loops over (or visits each) subdirectories from root directory defined by input_file_system_path to the lower + #level subfolders + for dirpath, dirnames, filenames_list in os.walk(input_file_system_path,topdown=True): group_name = dirpath.replace(os.sep,'/') @@ -243,16 +250,30 @@ def create_hdf5_file_from_filesystem_path(ofilename,input_file_system_path): # Set root_dir to top directory path in input file system root_dir = group_name group_name = group_name.replace(root_dir,'/') - file.create_dataset(name='file_list',data=filenames) - file.attrs.create(name='count',data=len(filenames)) + #h5file.attrs.create(name='count',data=len(filenames_list)) + h5file.attrs.create(name='file_list',data=filenames_list) else: group_name = group_name.replace(root_dir+'/','/') # Group hierarchy is implicitly defined by the forward slashes - file.create_group(group_name) - file[group_name].create_dataset(name='file_list',data=filenames) - file[group_name].attrs.create(name='count',data=len(filenames)) + h5file.create_group(group_name) + h5file[group_name].attrs.create(name='file_list',data=filenames_list) - file.attrs['count'] = file.attrs['count'] + file[group_name].attrs['count'] + # TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory) + + for filename in filenames_list: + + if 'ibw' in filename: + file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename)) + + h5file[group_name].create_dataset(name = file_dict['name'], + data = file_dict['data'], + #dtype = file_dict['dtype'], + shape = file_dict['shape']) + + #h5file[group_name][file_dict['name']].dims[0] = file_dict['dimension_units'] + + for key in file_dict['attributes_dict'].keys(): + h5file[group_name][file_dict['name']].attrs.create(name=key,data=file_dict['attributes_dict'][key]) def create_hdf5_file(ofilename,
input_data, approach : str, group_by_funcs : list, extract_attrs_func = None): @@ -398,14 +419,22 @@ def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame) return input_data -def main(): - inputfile_dir = 'Z:\\People\\Juan\\TypicalBeamTime' + +def main1(): + + inputfile_dir = '\\\\fs101\\5505\\People\\Juan\\TypicalBeamTime' + + #ibw_file = loadibw(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw') + + file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw') + group_by_type = lambda x : group_by_df_column(x,'filetype') - create_hdf5_file_from_filesystem_path('test2.h5', inputfile_dir) + #create_hdf5_file_from_filesystem_path('test2.h5',inputfile_dir) display_group_hierarchy_on_a_treemap('test2.h5') #create_hdf5_file('test', inputfile_dir, 'Topdown', [group_by_type], extract_attrs_func = None) +def main2(): # Read BeamTimeMetaData.h5, containing Thorsten's Matlab Table input_data_df = read_mtable_as_dataframe('input_files\\BeamTimeMetaData.h5') @@ -446,5 +475,7 @@ def main(): if __name__ == '__main__': - main() + main1() + + print(':)')