From e7bdee21da7813b5c12d3b7d739d8897b55fc167 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Thu, 15 Feb 2024 15:59:42 +0100 Subject: [PATCH] Refactored to interact with config_file.py, which sets available file readers --- src/hdf5_lib.py | 94 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 2592a81..af5c0c2 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -10,9 +10,11 @@ import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots -import g5505_file_reader +#import g5505_file_reader import g5505_utils as utils -import smog_chamber_file_reader +#import smog_chamber_file_reader + +import config_file def read_mtable_as_dataframe(filename): @@ -278,14 +280,23 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat # Filter out files with filenames not containing a keyword specified in the parameter 'select_file_keywords'. # When select_file_keywords is an empty, i.e., [], do not apply any filter on the filenames. - if select_file_keywords: - filtered_filename_list = [] + + + filtered_filename_list = [] + if select_file_keywords: for filename in filenames_list: - if any([date in filename for date in select_file_keywords]): - filtered_filename_list.append(filename) + if any([keyword in filename for keyword in select_file_keywords]): + filtered_filename_list.append(filename) else: filtered_filename_list = filenames_list.copy() + admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys()) + + for filename in filtered_filename_list.copy(): + if not any([ext in filename for ext in admissible_file_ext_list]): + filtered_filename_list.remove(filename) + + # Skip subdirectories that do not contain a keyword in the parameter 'select_dir_keywords' when it is nonempty if select_dir_keywords: if (dirpath.count(os.sep) > offset) and not any([item in dirpath for item in select_dir_keywords]): @@ -297,13 +308,15 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat # Set root_dir to top directory path in input file system root_dir = group_name group_name = group_name.replace(root_dir,'/') - #h5file.attrs.create(name='count',data=len(filenames_list)) - h5file.attrs.create(name='file_list',data=filtered_filename_list) + + h5file.attrs.create(name='filtered_file_list',data=filtered_filename_list) + h5file.attrs.create(name='file_list',data=filenames_list) else: group_name = group_name.replace(root_dir+'/','/') # Group hierarchy is implicitly defined by the forward slashes h5file.create_group(group_name) - h5file[group_name].attrs.create(name='file_list',data=filtered_filename_list) + h5file[group_name].attrs.create(name='filtered_file_list',data=filtered_filename_list) + h5file[group_name].attrs.create(name='file_list',data=filenames_list) # TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory) @@ -315,8 +328,24 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat for filename in filtered_filename_list: + # Get file extension (or file type) + file_name, file_ext = os.path.splitext(filename) + + #try: + if not 'h5' in filename: + file_obj = config_file.ext_to_reader_dict[file_ext](os.path.join(dirpath,filename)) + else: + config_file.ext_to_reader_dict[file_ext](source_file_path = os.path.join(dirpath,filename), + dest_file_obj = h5file, + dest_group_name = group_name +'/'+filename) + print(file_ext, ':)') + + + + if 'ibw' in filename: - file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename)) + #file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename)) + file_dict = file_obj h5file[group_name].create_dataset(name = file_dict['name'], data = file_dict['data'], @@ -327,25 +356,29 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat for key in file_dict['attributes_dict'].keys(): h5file[group_name][file_dict['name']].attrs.create(name=key,data=file_dict['attributes_dict'][key]) + + #if 'h5' in filename: + # Create copy of original file to avoid possible file corruption and work with it. + #backup_filename = 'backup_'+filename + # Path - if 'h5' in filename: + #shutil.copy(os.path.join(dirpath,filename), os.path.join(tmp_dirpath,backup_filename)) + # Open backup h5 file and copy complet filesystem directory onto a group in h5file + #with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file: + # h5file.copy(source=src_file['/'],dest= group_name +'/'+filename) + + # h5file.copy(source= file_obj, dest= group_name +'/'+filename) - # Create copy of original file to avoid possible file corruption and work with it. - backup_filename = 'backup_'+filename - # Path - - shutil.copy(os.path.join(dirpath,filename), os.path.join(tmp_dirpath,backup_filename)) - # Open backup h5 file and copy complet filesystem directory onto a group in h5file - with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file: - h5file.copy(source=src_file['/'],dest= group_name +'/'+filename) # TODO: generilize to multiphase chemistry text and dat files # TODO: include header information from files as well if ('txt' in filename or 'TXT' in filename) and any([item in os.path.join(dirpath,filename) for item in ['smps','gas']]): - if 'smps' in os.path.join(dirpath,filename): - file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'smps') - elif 'gas' in os.path.join(dirpath,filename): - file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'gas') + #if 'smps' in os.path.join(dirpath,filename): + # file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'smps') + #elif 'gas' in os.path.join(dirpath,filename): + # file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'gas') + + file_dict = file_obj # TODO: create datasets of compound data type to include variable/or column names and datetimestamps h5file[group_name].create_group(filename) @@ -469,13 +502,10 @@ def main_5505(): inputfile_dir = '\\\\fs101\\5505\\People\\Juan\\TypicalBeamTime' - file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw') - group_by_type = lambda x : utils.group_by_df_column(x,'filetype') - select_file_keywords=[] select_dir_keywords = ['NEXAFS', 'Notes', 'Photos', 'Pressure', 'RGA', 'SES'] - create_hdf5_file_from_filesystem_path('test_sls_data.h5',inputfile_dir,select_dir_keywords,select_file_keywords) - display_group_hierarchy_on_a_treemap('test_smog_chamber_v5.h5') + create_hdf5_file_from_filesystem_path('test_sls_data_v1.h5',inputfile_dir,select_dir_keywords,select_file_keywords) + display_group_hierarchy_on_a_treemap('test_sls_data_v1.h5') #create_hdf5_file('test', inputfile_dir, 'Topdown', [group_by_type], extract_attrs_func = None) @@ -486,8 +516,8 @@ def main_smog_chamber(): include_list = ['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26'] select_date_list = ['20220726','2022.07.26'] - create_hdf5_file_from_filesystem_path('test_smog_chamber_v5.h5',inputfile_dir,include_list,select_date_list) - display_group_hierarchy_on_a_treemap('test_smog_chamber_v5.h5') + create_hdf5_file_from_filesystem_path('test_smog_chamber_v6.h5',inputfile_dir,include_list,select_date_list) + display_group_hierarchy_on_a_treemap('test_smog_chamber_v6.h5') def main_mtable_h5_from_dataframe(): @@ -533,7 +563,9 @@ def main_mtable_h5_from_dataframe(): if __name__ == '__main__': - main_mtable_h5_from_dataframe() + #main_smog_chamber() + #main_mtable_h5_from_dataframe() + main_5505() print(':)')