From e7bdee21da7813b5c12d3b7d739d8897b55fc167 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe <juan.florez-ospina@psi.ch>
Date: Thu, 15 Feb 2024 15:59:42 +0100
Subject: [PATCH] Refactored to interact with config_file.py, which sets
 available file readers

---
 src/hdf5_lib.py | 94 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 31 deletions(-)

diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py
index 2592a81..af5c0c2 100644
--- a/src/hdf5_lib.py
+++ b/src/hdf5_lib.py
@@ -10,9 +10,11 @@ import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 
-import g5505_file_reader
+#import g5505_file_reader
 import g5505_utils as utils
-import smog_chamber_file_reader
+#import smog_chamber_file_reader 
+
+import config_file
 
 
 def read_mtable_as_dataframe(filename):
@@ -278,14 +280,23 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat
         
             # Filter out files with filenames not containing a keyword specified in the parameter 'select_file_keywords'.
             # When select_file_keywords is an empty, i.e., [], do not apply any filter on the filenames.
-            if select_file_keywords:
-                filtered_filename_list = []
+            
+            
+            filtered_filename_list = []
+            if select_file_keywords:                
                 for filename in filenames_list:
-                    if any([date in filename for date in select_file_keywords]):
-                        filtered_filename_list.append(filename)
+                    if any([keyword in filename for keyword in select_file_keywords]):
+                            filtered_filename_list.append(filename)
             else:
                 filtered_filename_list = filenames_list.copy()
 
+            admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys())
+
+            for filename in filtered_filename_list.copy():
+                if not any([ext in filename for ext in admissible_file_ext_list]):
+                    filtered_filename_list.remove(filename)
+
+
             # Skip subdirectories that do not contain a keyword in the parameter 'select_dir_keywords' when it is nonempty
             if select_dir_keywords:
                 if (dirpath.count(os.sep) > offset) and not any([item in dirpath for item in select_dir_keywords]):
@@ -297,13 +308,15 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat
                 # Set root_dir to top directory path in input file system
                 root_dir = group_name                  
                 group_name = group_name.replace(root_dir,'/')
-                #h5file.attrs.create(name='count',data=len(filenames_list))
-                h5file.attrs.create(name='file_list',data=filtered_filename_list)
+
+                h5file.attrs.create(name='filtered_file_list',data=filtered_filename_list)
+                h5file.attrs.create(name='file_list',data=filenames_list)
             else:
                 group_name = group_name.replace(root_dir+'/','/')
                 # Group hierarchy is implicitly defined by the forward slashes
                 h5file.create_group(group_name)
-                h5file[group_name].attrs.create(name='file_list',data=filtered_filename_list)
+                h5file[group_name].attrs.create(name='filtered_file_list',data=filtered_filename_list)
+                h5file[group_name].attrs.create(name='file_list',data=filenames_list)
 
 
             # TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)  
@@ -315,8 +328,24 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat
 
             for filename in filtered_filename_list:
                 
+                # Get file extension (or file type)
+                file_name, file_ext = os.path.splitext(filename)
+
+                #try: 
+                if not 'h5' in filename:
+                    file_obj = config_file.ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
+                else:
+                    config_file.ext_to_reader_dict[file_ext](source_file_path = os.path.join(dirpath,filename), 
+                                                   dest_file_obj = h5file, 
+                                                   dest_group_name = group_name +'/'+filename)
+                print(file_ext, ':)')
+
+
+
+                
                 if 'ibw' in filename:
-                    file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename))
+                    #file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(os.path.join(dirpath,filename))
+                    file_dict = file_obj
                     
                     h5file[group_name].create_dataset(name  = file_dict['name'], 
                                           data  = file_dict['data'],
@@ -327,25 +356,29 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, input_file_system_pat
 
                     for key in file_dict['attributes_dict'].keys():
                         h5file[group_name][file_dict['name']].attrs.create(name=key,data=file_dict['attributes_dict'][key])
+                        
+                #if 'h5' in filename:
+                        # Create copy of original file to avoid possible file corruption and work with it.
+                        #backup_filename = 'backup_'+filename
+                        # Path                     
 
-                if 'h5' in filename:
+                        #shutil.copy(os.path.join(dirpath,filename), os.path.join(tmp_dirpath,backup_filename))
+                        # Open backup h5 file and copy complete filesystem directory onto a group in h5file
+                        #with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file:
+                        #    h5file.copy(source=src_file['/'],dest= group_name +'/'+filename)
+                        
+                #        h5file.copy(source= file_obj, dest= group_name +'/'+filename)
 
-                    # Create copy of original file to avoid possible file corruption and work with it.
-                    backup_filename = 'backup_'+filename
-                    # Path                     
-
-                    shutil.copy(os.path.join(dirpath,filename), os.path.join(tmp_dirpath,backup_filename))
-                    # Open backup h5 file and copy complet filesystem directory onto a group in h5file
-                    with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file:
-                        h5file.copy(source=src_file['/'],dest= group_name +'/'+filename)
 
                 # TODO: generilize to multiphase chemistry text and dat files
                 # TODO: include header information from files as well
                 if ('txt' in filename or 'TXT' in filename) and any([item in os.path.join(dirpath,filename) for item in ['smps','gas']]):                    
-                    if 'smps' in os.path.join(dirpath,filename):
-                        file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'smps')
-                    elif 'gas' in os.path.join(dirpath,filename):
-                        file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'gas')
+                    #if 'smps' in os.path.join(dirpath,filename):
+                    #    file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'smps')
+                    #elif 'gas' in os.path.join(dirpath,filename):
+                    #    file_dict = smog_chamber_file_reader.read_txt_files_as_dict(os.path.join(dirpath,filename),'gas')
+
+                    file_dict = file_obj
 
                     # TODO: create datasets of compound data type to include variable/or column names and datetimestamps
                     h5file[group_name].create_group(filename)                    
@@ -469,13 +502,10 @@ def main_5505():
 
     inputfile_dir = '\\\\fs101\\5505\\People\\Juan\\TypicalBeamTime'
 
-    file_dict = g5505_file_reader.read_xps_ibw_file_as_dict(inputfile_dir+'\\SES\\0069069_N1s_495eV.ibw')
-    group_by_type = lambda x : utils.group_by_df_column(x,'filetype')
-
     select_file_keywords=[]
     select_dir_keywords = ['NEXAFS', 'Notes', 'Photos', 'Pressure', 'RGA', 'SES']
-    create_hdf5_file_from_filesystem_path('test_sls_data.h5',inputfile_dir,select_dir_keywords,select_file_keywords)
-    display_group_hierarchy_on_a_treemap('test_smog_chamber_v5.h5')
+    create_hdf5_file_from_filesystem_path('test_sls_data_v1.h5',inputfile_dir,select_dir_keywords,select_file_keywords)
+    display_group_hierarchy_on_a_treemap('test_sls_data_v1.h5')
 
     #create_hdf5_file('test', inputfile_dir, 'Topdown', [group_by_type], extract_attrs_func = None)
 
@@ -486,8 +516,8 @@ def main_smog_chamber():
     include_list = ['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26']
     select_date_list = ['20220726','2022.07.26']
 
-    create_hdf5_file_from_filesystem_path('test_smog_chamber_v5.h5',inputfile_dir,include_list,select_date_list)
-    display_group_hierarchy_on_a_treemap('test_smog_chamber_v5.h5')
+    create_hdf5_file_from_filesystem_path('test_smog_chamber_v6.h5',inputfile_dir,include_list,select_date_list)
+    display_group_hierarchy_on_a_treemap('test_smog_chamber_v6.h5')
 
 def main_mtable_h5_from_dataframe():
 
@@ -533,7 +563,9 @@ def main_mtable_h5_from_dataframe():
 
 if __name__ == '__main__':
 
-    main_mtable_h5_from_dataframe()
+    #main_smog_chamber()
+    #main_mtable_h5_from_dataframe()
+    main_5505()
 
     print(':)')