From 9c70fd643f2dc2d1001a526b93edb84f3981d407 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Thu, 28 Mar 2024 19:38:12 +0100
Subject: [PATCH 01/15] Refactored code in terms of subprocess for git
 functionality.

---
 src/metadata_review_lib.py | 74 +++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py
index c6657ca..4ba095d 100644
--- a/src/metadata_review_lib.py
+++ b/src/metadata_review_lib.py
@@ -105,7 +105,7 @@ def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs):

-def second_submit_metadata_review(filename_path, reviewer_attrs):
+def second_submit_metadata_review(review_yaml_file_path, reviewer_attrs):
     """
     Once you are done reviewing the yaml representation of the hdf5 file in the review folder, change the review status to complete and save (add and commit) the modified .yaml and .txt files in the project by
@@ -118,29 +118,63 @@

     initials = reviewer_attrs['initials']
     branch_name = '-'.join([reviewer_attrs['type'],'review','by',initials])
-
-    if any([status in get_review_status(filename_path) for status in ['under review','submitted']]):
-        filename_path_tail, filename_path_head = os.path.split(filename_path)
-        filename, ext = os.path.splitext(filename_path_head)
-        # TODO:
-        with open(os.path.join("review/",filename+"-review_status"+TXT_EXT),'a') as f:
-            f.write('\nsubmitted')

     # TODO: replace with subprocess + git
     checkout_review_branch(repo_obj, branch_name)

-    status_dict = repo_obj.status()
-    for filepath, file_status in status_dict.items():
-        # Identify keys associated to review files and stage them
-        if ('review/'+filename in filepath) and (file_status == pygit.GIT_STATUS_WT_MODIFIED):
-            # Stage changes
-            repo_obj.index.add(filepath)

-    author = config_file.author #default_signature
-    committer = config_file.committer
-    message = "Submitted metadata review."
-    tree = repo_obj.index.write_tree()
-    oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])
+    #if any([status in get_review_status(filename_path) for status in ['under review','submitted']]):
+    #    filename_path_tail, filename_path_head = os.path.split(filename_path)
+    #    filename, ext = os.path.splitext(filename_path_head)
+    #    # TODO:
+
+
+    ##
+    status_command = ['git','status']
+    add_command = lambda add_list: ['git','add'] + add_list
+    commit_command = lambda message: ['git','commit','-m', message]
+    #push_command = lambda repository,refspec: ['git','push',repository,refspec]
+
+    status = subprocess.run(status_command,capture_output=True,check=True)
+
+    files_to_add_list = []
+    for line in status.stdout.splitlines():
+        # convert line from bytes to str
+        tmp = line.decode("utf-8")
+        if 'modified' in tmp and review_yaml_file_path in tmp:
+            files_to_add_list.append(tmp.split()[1])
+    ##
+
+    review_yaml_file_path_tail, review_yaml_file_path_head = os.path.split(review_yaml_file_path)
+    filename, ext = os.path.splitext(review_yaml_file_path_head)
+    if files_to_add_list:
+        review_status_file_path = os.path.join("review/",filename+"-review_status"+TXT_EXT)
+        with open(review_status_file_path,'a') as f:
+            f.write('\nsubmitted')
+
+        files_to_add_list.append(review_status_file_path)
+
+        result = subprocess.run(add_command(files_to_add_list),capture_output=True,check=True)
+        message = 'Submitted metadata review.'
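+        # Aside (not part of this patch): parsing the human-readable `git status` text is
+        # brittle across git versions and locales; `git status --porcelain` is the stable,
+        # machine-readable alternative. A sketch of the same file collection with it:
+        #   out = subprocess.run(['git', 'status', '--porcelain'],
+        #                        capture_output=True, text=True, check=True)
+        #   modified = [l[3:] for l in out.stdout.splitlines() if 'M' in l[:2]]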
+        commit_output = subprocess.run(commit_command(message),capture_output=True,check=True)
+
+        for line in commit_output.stdout.splitlines():
+            print(line.decode('utf-8'))
+    else:
+        print('Nothing to commit.')
+
+
+    #status_dict = repo_obj.status()
+    #for filepath, file_status in status_dict.items():
+        # Identify keys associated to review files and stage them
+    #    if ('review/'+filename in filepath) and (file_status == pygit.GIT_STATUS_WT_MODIFIED):
+            # Stage changes
+    #        repo_obj.index.add(filepath)
+
+    #author = config_file.author #default_signature
+    #committer = config_file.committer
+    #message = "Submitted metadata review."
+    #tree = repo_obj.index.write_tree()
+    #oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])

From 39cae669367efbee4be2fda387073fd687b31da8 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Tue, 2 Apr 2024 17:33:58 +0200
Subject: [PATCH 02/15] Implemented two important changes. 1) The output
 filename is no longer passed as input; it is computed automatically from an
 input config_param dict. 2) Input filenames in the file system path are now
 filtered during an initial walk through the directory tree, so that the
 stored path filenames can later be used to prune the directory tree.

---
 src/hdf5_lib.py | 140 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 98 insertions(+), 42 deletions(-)

diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py
index 7126179..1810e67 100644
--- a/src/hdf5_lib.py
+++ b/src/hdf5_lib.py
@@ -17,6 +17,14 @@ import h5py
 import yaml

+def progressBar(count_value, total, suffix=''):
+    bar_length = 100
+    filled_up_Length = int(round(bar_length* count_value / float(total)))
+    percentage = round(100.0 * count_value/float(total),1)
+    bar = '=' * filled_up_Length + '-' * (bar_length - filled_up_Length)
+    sys.stdout.write('[%s] %s%s ...%s\r' %(bar, percentage, '%', suffix))
+    sys.stdout.flush()
+
 def read_mtable_as_dataframe(filename):
     """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input .h5 file
@@ -204,15 +212,20 @@ def annotate_root_dir(filename,annotation_dict: dict):

 import shutil

-def create_hdf5_file_from_filesystem_path(ofilename : str,
+def create_hdf5_file_from_filesystem_path(config_param : dict ,
                                           input_file_system_path : str,
                                           select_dir_keywords = [],
                                           select_file_keywords =[],
                                           top_sub_dir_mask : bool = True):
+#def create_hdf5_file_from_filesystem_path(output_filename : str,
+#                                          input_file_system_path : str,
+#                                          select_dir_keywords = [],
+#                                          select_file_keywords =[],
+#                                          top_sub_dir_mask : bool = True):

     """
-    Creates an .h5 file with name ofilename that preserves the directory tree (or folder structure) of given a filesystem path and
-    a few file and directory keywords. The keywords enable filtering of directories and files that do not contain the specified keywords.
+    Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) of a given filesystem path.
+    When the file and directory keywords are non-empty, they filter out directories and files that do not contain the specified keywords.

    In the .h5 file, only files in admissible file formats will be stored in the form of datasets and attributes.
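+
+    A call sketch (the share path and keyword values are illustrative, borrowed from the
+    example notebook later in this series; the config keys and the return tuple come from this patch):
+
+        config_param = {'group_id': 'smog_chamber', 'user_initials': 'NG', 'output_dir': 'output_files/'}
+        output_h5, output_yaml = create_hdf5_file_from_filesystem_path(config_param,
+                                                                       '//fs03/Iron_Sulphate',
+                                                                       select_dir_keywords=['gas', 'smps/20220726'],
+                                                                       select_file_keywords=['20220726', '2022.07.26'])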
@@ -237,6 +250,15 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,

     # Ensure OS compliant paths and keywords

+    # TODO: validate config_param dict, make sure output_filename is a valid file_path
+    group_id = config_param['group_id']
+    user_initials = config_param['user_initials']
+    created_at = config_file.created_at()
+    output_dir = config_param['output_dir']
+    output_filename = output_dir + config_file.output_filename_tempate(group_id,created_at,user_initials)
+
+    admissible_file_ext_list = list(config_file.select_file_readers(group_id).keys())
+
     if '/' in input_file_system_path:
         input_file_system_path = input_file_system_path.replace('/',os.sep)
     else:
@@ -246,7 +268,7 @@
         select_dir_keywords[i] = keyword.replace('/',os.sep)

-    with h5py.File(ofilename, 'w') as h5file:
+    with h5py.File(output_filename, 'w') as h5file:

         # Visit each subdirectory from top to bottom, root directory defined by input_file_system_path to the lower
         # level directories.
@@ -266,6 +288,26 @@

             root_dir = input_file_system_path

+            # Create a dictionary with directory-files pairs, where the files satisfy keyword and admissible type constraints.
+            # It requires an extra pass over the directory tree and additional memory for the dictionary, but it may be useful
+            # to speed up the subsequent step and prune the resulting directory tree.
+            file_paths_dict = {}
+            if select_file_keywords:
+                for dirpath, _, filenames_list in os.walk(item,topdown=False):
+                    file_paths_dict[dirpath] = []
+                    for filename in filenames_list:
+
+                        if not any([ext in filename for ext in admissible_file_ext_list]):
+                            continue
+
+                        if any([keyword in filename for keyword in select_file_keywords]):
+                            file_paths_dict[dirpath].append(filename)
+
+            #admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys())
+            #for filename in filtered_filename_list.copy():
+            #    if not any([ext in filename for ext in admissible_file_ext_list]):
+            #        filtered_filename_list.remove(filename)
+
             for node_number, node in enumerate(os.walk(item, topdown=True)):

                 dirpath, dirnames, filenames_list = node
@@ -277,26 +319,24 @@

                 # When select_file_keywords is empty, i.e., [], do not apply any filter on the filenames.
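+                # Two asides on the filtering above (sketches, not part of this patch):
+                # 1) substring tests like `ext in filename` can misfire ('.dat' also matches
+                #    'notes.data.bak'); exact extension matching is safer:
+                #      os.path.splitext(filename)[1] in admissible_file_ext_list
+                # 2) the keyword filter is equivalent to the one-liner:
+                #      [fn for fn in filenames_list if any(k in fn for k in select_file_keywords)]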
- filtered_filename_list = [] - if select_file_keywords: - for filename in filenames_list: - if any([keyword in filename for keyword in select_file_keywords]): - filtered_filename_list.append(filename) - else: - filtered_filename_list = filenames_list.copy() + #filtered_filename_list = [] + #if select_file_keywords: + # for filename in filenames_list: + # if any([keyword in filename for keyword in select_file_keywords]): + # filtered_filename_list.append(filename) + #else: + # filtered_filename_list = filenames_list.copy() - admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys()) - - for filename in filtered_filename_list.copy(): - if not any([ext in filename for ext in admissible_file_ext_list]): - filtered_filename_list.remove(filename) + filtered_filename_list = file_paths_dict.get(dirpath,filenames_list.copy()) # Skip subdirectories that do not contain a keyword in the parameter 'select_dir_keywords' when it is nonempty if select_dir_keywords: #if (dirpath.count(os.sep) > offset) and not any([item in dirpath for item in select_dir_keywords]): + #tail, dirname = os.path.split(dirpath) + #if not any([item in dirname for item in select_dir_keywords]): if not any([item in dirpath for item in select_dir_keywords]): - continue + continue group_name = dirpath.replace(os.sep,'/') group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/') @@ -308,46 +348,62 @@ def create_hdf5_file_from_filesystem_path(ofilename : str, # TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory) - for filename in filtered_filename_list: + for filenumber, filename in enumerate(filtered_filename_list): # Get file extension (or file type) file_name, file_ext = os.path.splitext(filename) + #print(filename) + #try: if not 'h5' in filename: - file_dict = config_file.ext_to_reader_dict[file_ext](os.path.join(dirpath,filename)) + file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename)) if not file_dict: continue - # Create group and add their attributes - h5file[group_name].create_group(name=file_dict['name']) - for key in file_dict['attributes_dict'].keys(): - - # Represent string values as fixed length strings in the HDF5 file, which need - # to be decoded as string when we read them. It provides better control than variable strings, - # at the expense of flexibility. - # https://docs.h5py.org/en/stable/strings.html - value = file_dict['attributes_dict'][key] - if isinstance(value,str): - utf8_type = h5py.string_dtype('utf-8', len(value)) - value = np.array(value.encode('utf-8'),dtype=utf8_type) + try: + # Create group and add their attributes + h5file[group_name].create_group(name=file_dict['name']) + for key in file_dict['attributes_dict'].keys(): + + # Represent string values as fixed length strings in the HDF5 file, which need + # to be decoded as string when we read them. It provides better control than variable strings, + # at the expense of flexibility. 
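+                            # A round-trip sketch (aside, not part of this patch):
+                            #   utf8_type = h5py.string_dtype('utf-8', len('NO2'))
+                            #   stored = np.array('NO2'.encode('utf-8'), dtype=utf8_type)
+                            #   stored[()].decode('utf-8')  # -> 'NO2' when read back
+                            # For details on fixed-length vs. variable-length HDF5 strings see: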
+ # https://docs.h5py.org/en/stable/strings.html + value = file_dict['attributes_dict'][key] + if isinstance(value,str): + utf8_type = h5py.string_dtype('utf-8', len(value)) + value = np.array(value.encode('utf-8'),dtype=utf8_type) - h5file[group_name][file_dict['name']].attrs.create(name=key, - data=value) - - # Add datasets to just created group - for dataset in file_dict['datasets']: - h5file[group_name][file_dict['name']].create_dataset(name = dataset['name'], - data = dataset['data'], - #dtype = file_dict['dtype'], - shape = dataset['shape']) + h5file[group_name][file_dict['name']].attrs.create(name=key, + data=value) + + # Add datasets to just created group + for dataset in file_dict['datasets']: + h5file[group_name][file_dict['name']].create_dataset(name = dataset['name'], + data = dataset['data'], + #dtype = file_dict['dtype'], + shape = dataset['shape']) + + except Exception as inst: + # TODO: log when a file could not be stored as a dataset + print(inst) else: - config_file.ext_to_reader_dict[file_ext](source_file_path = os.path.join(dirpath,filename), + config_file.select_file_readers(group_id)[file_ext](source_file_path = os.path.join(dirpath,filename), dest_file_obj = h5file, dest_group_name = group_name +'/'+filename) - print(file_ext, ':)') + #print(filename,file_ext, ':)') + + + progressBar(filenumber,len(filtered_filename_list), 'Uploading files in ' + dirpath) + + + + output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename) + + return output_filename, output_yml_filename_path From f351f102b72b2b6d2e82af7378aa0330c7bdce02 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Tue, 2 Apr 2024 18:31:58 +0200 Subject: [PATCH 03/15] Commented out a print statement. --- src/hdf5_vis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdf5_vis.py b/src/hdf5_vis.py index ada825a..036f9bc 100644 --- a/src/hdf5_vis.py +++ b/src/hdf5_vis.py @@ -140,7 +140,7 @@ def print_metadata(name, obj, folder_depth, yaml_dict): #group_dict[obj.name]["name"] = obj.name #group_dict[obj.name]["attributes"] = attr_dict #group_dict[obj.name]["datasets"] = {} - print(name) + #print(name) yaml_dict[obj.name] = group_dict elif isinstance(obj, h5py.Dataset): From 9071120e50fd6643569180b03474fa0a9c6aa7f6 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Tue, 2 Apr 2024 18:35:04 +0200 Subject: [PATCH 04/15] Refactored code to read .dat and .txt files in binary mode first rb, then the prespecified encoding is used to decode the lines. This is to have more control over the decoding process and be able to better spot possible encoding errors. --- src/g5505_file_reader.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/g5505_file_reader.py b/src/g5505_file_reader.py index 97abb1a..f069e63 100644 --- a/src/g5505_file_reader.py +++ b/src/g5505_file_reader.py @@ -93,10 +93,12 @@ def read_txt_files_as_dict(filename : str ): file_encoding = 'latin-1' elif 'ICAD' in filename and 'HONO' in filename: table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. 
Intensity valid MeasMode SampleSource' - separator = '\t' + separator = '\t' + file_encoding = 'latin-1' elif 'ICAD' in filename and 'NO2' in filename: table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource' separator = '\t' + file_encoding = 'latin-1' else: return {} #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"') @@ -107,33 +109,32 @@ def read_txt_files_as_dict(filename : str ): # Work with copy of the file for safety tmp_filename = utils.make_file_copy(source_file_path=filename) - with open(tmp_filename,'r',encoding=file_encoding,errors='ignore') as f: - #file_encoding = f.encoding - #table_preamble = "" + #with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f: + with open(tmp_filename,'rb') as f: table_preamble = [] for line_number, line in enumerate(f): - if table_header in line: - list_of_substrings = line.split(separator) + if table_header in line.decode(file_encoding): + list_of_substrings = line.decode(file_encoding).split(separator) data_start = True column_names = [] for i, name in enumerate(list_of_substrings): column_names.append(str(i)+'_'+name) - print(line_number, len(column_names )) + #print(line_number, len(column_names ),'\n') break # Subdivide line into words, and join them by single space. # I asumme this can produce a cleaner line that contains no weird separator characters \t \r or extra spaces and so on. - list_of_substrings = line.split() + list_of_substrings = line.decode(file_encoding).split() # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character #line = ' '.join(list_of_substrings+['\n']) - line = ' '.join(list_of_substrings) - table_preamble.append(line)# += new_line + #line = ' '.join(list_of_substrings) + table_preamble.append(' '.join(list_of_substrings))# += new_line header_dict["table_preamble"] = table_preamble - # TODO: it does not work with separater as none :(. fix for RGA + # TODO: it does not work with separator as none :(. fix for RGA try: df = pd.read_csv(tmp_filename, delimiter = separator, From 9cde013be0aeb2c8dbdffcfac92f5ebab7414d9c Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Tue, 2 Apr 2024 18:48:50 +0200 Subject: [PATCH 05/15] Modified node values as the number of children of each group. When nodes are datasets, their value is 1. --- src/hdf5_lib.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 1810e67..79ef5e0 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -158,10 +158,11 @@ def get_parent_child_relationships(file: h5py.File): parent = [''] #values = [file.attrs['count']] # TODO: maybe we should make this more general and not dependent on file_list attribute? 
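+    # Note: len(file.keys()) counts only the immediate children of a group
+    # (subgroups and datasets alike), which is the per-node value the treemap expects.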
- if 'file_list' in file.attrs.keys(): - values = [len(file.attrs['file_list'])] - else: - values = [1] + #if 'file_list' in file.attrs.keys(): + # values = [len(file.attrs['file_list'])] + #else: + # values = [1] + values = [len(file.keys())] def node_visitor(name,obj): #if isinstance(obj,h5py.Group): @@ -169,10 +170,12 @@ def get_parent_child_relationships(file: h5py.File): parent.append(obj.parent.name) #nodes.append(os.path.split(obj.name)[1]) #parent.append(os.path.split(obj.parent.name)[1]) - if isinstance(obj,h5py.Dataset) or not 'file_list' in obj.attrs.keys(): + + if isinstance(obj,h5py.Dataset):# or not 'file_list' in obj.attrs.keys(): values.append(1) else: - values.append(len(obj.attrs['file_list'])) + values.append(len(obj.keys())) + #values.append(len(obj.attrs['file_list'])) file.visititems(node_visitor) return nodes, parent, values From f9b31c06fdfbe4d015d0a8c30968c58861d4d8a2 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Wed, 3 Apr 2024 13:49:16 +0200 Subject: [PATCH 06/15] Reimplemented file filtering, first file extension contraints are imposed and then file keyword contraints. --- src/hdf5_lib.py | 66 ++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 79ef5e0..386d5a4 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -268,24 +268,23 @@ def create_hdf5_file_from_filesystem_path(config_param : dict , raise ValueError('input_file_system_path needs to be specified using forward slashes "/".' ) for i, keyword in enumerate(select_dir_keywords): - select_dir_keywords[i] = keyword.replace('/',os.sep) + select_dir_keywords[i] = keyword.replace('/',os.sep) + # Visit each subdirectory from top to bottom, root directory defined by input_file_sytem_path to the lower + # level directories. - with h5py.File(output_filename, 'w') as h5file: + # Constrain walkable paths on the specified directory tree by allowing walks that start from root + # through subdirectories specified by dir_keywords. This improves efficiency especially, in deep + # directory trees with many leaves. + paths = [] + if top_sub_dir_mask: + for item in os.listdir(input_file_system_path): + if any([item in keyword for keyword in select_dir_keywords]): + paths.append(os.path.join(input_file_system_path,item)) + else: + paths.append(input_file_system_path) - # Visit each subdirectory from top to bottom, root directory defined by input_file_sytem_path to the lower - # level directories. - - # Constrain walkable paths on the specified directory tree by allowing walks that start from root - # through subdirectories specified by dir_keywords. This improves efficiency especially, in deep - # directory trees with many leaves. - paths = [] - if top_sub_dir_mask: - for item in os.listdir(input_file_system_path): - if any([item in keyword for keyword in select_dir_keywords]): - paths.append(os.path.join(input_file_system_path,item)) - else: - paths.append(input_file_system_path) + with h5py.File(output_filename, 'w') as h5file: for item in paths: @@ -294,23 +293,34 @@ def create_hdf5_file_from_filesystem_path(config_param : dict , # Create dictionary with directory-files pairs where files satisfy keyword and admisible type contraints # It requires an extra pass over directory three and additional memory for dictionary, but it may be useful # to speed up subsequent step and prune resulting directory tree. 
- file_paths_dict = {} - if select_file_keywords: - for dirpath, _, filenames_list in os.walk(item,topdown=False): - file_paths_dict[dirpath] = [] - for filename in filenames_list: - if not any([ext in filename for ext in admissible_file_ext_list]): - continue + # For each directory and/or subdirectory, keep files that satisfy file_keyword constraints, and store + # (directory_path, suitable files) relationships in a dictionary. + file_paths_dict = {} + check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list]) + + for dirpath, _, filenames in os.walk(item,topdown=False): + file_paths_dict[dirpath] = [] + + # Check files that have an admissible extension and store them in admissible_filenames list + admissible_filenames = [] + for fn in filenames: + if check_file_ext(fn): + admissible_filenames.append(fn) + + if select_file_keywords: # when select_file_keywords = [], all files are considered + for filename in admissible_filenames: + # Do not consider files with types for which there is still no file_reader. TODO: extend file_reader library. + #if not any([ext in filename for ext in admissible_file_ext_list]): + # continue + + # Add files with name, that contains any of the file_keywords if any([keyword in filename for keyword in select_file_keywords]): file_paths_dict[dirpath].append(filename) - - #admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys()) - #for filename in filtered_filename_list.copy(): - # if not any([ext in filename for ext in admissible_file_ext_list]): - # filtered_filename_list.remove(filename) - + else: + file_paths_dict[dirpath] = admissible_filenames + for node_number, node in enumerate(os.walk(item, topdown=True)): dirpath, dirnames, filenames_list = node From 5cd19979b6c4ea80f521e41326061d6dad5948e4 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Wed, 3 Apr 2024 13:51:21 +0200 Subject: [PATCH 07/15] Implemented first approach to data integration workflow --- workflow_data_integration.ipynb | 395 ++++++++++++-------------------- 1 file changed, 152 insertions(+), 243 deletions(-) diff --git a/workflow_data_integration.ipynb b/workflow_data_integration.ipynb index fc82d59..45dc92c 100644 --- a/workflow_data_integration.ipynb +++ b/workflow_data_integration.ipynb @@ -1,41 +1,55 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import python packages and modules" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "# Set up project root directory\n", + "root_dir = os.path.abspath(os.curdir)\n", + "sys.path.append(root_dir)\n", + "\n", + "import src.hdf5_vis as hdf5_vis\n", + "import src.hdf5_lib as hdf5_lib\n", + "import input_files.config_file as config_file\n", + "\n", + "\n", + "output_dir = 'output_files/'\n", + "group_id = '5505'#'smog_chamber'#'5505'\n", + "user_initials = 'LL' #'NG' #'LL' # 'TBR'\n", + "\n", + "group_id = 'smog_chamber'#'5505'\n", + "user_initials = 'NG'#'LL' #'NG' #'LL' # 'TBR'\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define input file path and keywords\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2 26\n", - ".dat :)\n", - "2 26\n", - ".dat :)\n", - "2 26\n", - ".dat :)\n", - "2 26\n", - ".dat :)\n", - "Humidity_Sensors\n", - "Humidity_Sensors/2022\n", - 
"Humidity_Sensors/2022/01_Januar\n", - "Humidity_Sensors/2022/02_Februar\n", - "Humidity_Sensors/2022/03_März\n", - "Humidity_Sensors/2022/04_April\n", - "Humidity_Sensors/2022/05_Mai\n", - "Humidity_Sensors/2022/06_Juni\n", - "Humidity_Sensors/2022/07_Juli\n", - "Humidity_Sensors/2022/10_Oktober\n", - "Humidity_Sensors/2022/11_November\n", - "Humidity_Sensors/2022/12_Dezember\n", - "ICAD\n", - "ICAD/HONO\n", - "ICAD/HONO/2022\n", - "ICAD/HONO_prototype\n", - "ICAD/HONO_prototype/2022\n", - "ICAD/NO2\n", - "ICAD/NO2/2022\n" + "[==================================================--------------------------------------------------] 50.0% ...Uploading files in \\\\fs03\\Iron_Sulphate\\smps\\20220726\r" ] }, { @@ -49,237 +63,113 @@ "branchvalues": "remainder", "customdata": [ "
", - "/Humidity_Sensors", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022/01_Januar", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/02_Februar", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/03_März", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/04_April", - "/Humidity_Sensors/2022/05_Mai", - "/Humidity_Sensors/2022/06_Juni", - "/Humidity_Sensors/2022/07_Juli", - "/Humidity_Sensors/2022/10_Oktober", - "/Humidity_Sensors/2022/11_November", - "/Humidity_Sensors/2022/12_Dezember", - "/ICAD", - "/ICAD/HONO", - "/ICAD/HONO/2022", - "/ICAD/HONO/2022/10_Oct", - "/ICAD/HONO/2022/11_Nov", - "/ICAD/HONO/2022/12_Dec", - "/ICAD/HONO_prototype", - "/ICAD/HONO_prototype/2022", - "/ICAD/HONO_prototype/2022/01_Jan", - "/ICAD/HONO_prototype/2022/02_Feb", - "/ICAD/NO2", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022/01_Jan", - "/ICAD/NO2/2022/02_Feb", - "/ICAD/NO2/2022/03_Mar", - "/ICAD/NO2/2022/04_Apr", - "/ICAD/NO2/2022/05_May", - "/ICAD/NO2/2022/06_June", - "/ICAD/NO2/2022/07_July", - "/ICAD/NO2/2022/10_Oct", - "/ICAD/NO2/2022/11_Nov", - "/ICAD/NO2/2022/12_Dec" + "/gas", + "/gas/20220726_000004_MSC_gases.txt", + "/gas/20220726_000004_MSC_gases.txt/categorial_variable_names", + "/gas/20220726_000004_MSC_gases.txt/categorical_variables", + "/gas/20220726_000004_MSC_gases.txt/numerical_variable_names", + "/gas/20220726_000004_MSC_gases.txt/numerical_variables", + "/gas/20220726_101617_MSC_gases.txt", + "/gas/20220726_101617_MSC_gases.txt/categorial_variable_names", + 
"/gas/20220726_101617_MSC_gases.txt/categorical_variables", + "/gas/20220726_101617_MSC_gases.txt/numerical_variable_names", + "/gas/20220726_101617_MSC_gases.txt/numerical_variables", + "/smps", + "/smps/20220726", + "/smps/20220726/20220726_mass.TXT", + "/smps/20220726/20220726_mass.TXT/categorial_variable_names", + "/smps/20220726/20220726_mass.TXT/categorical_variables", + "/smps/20220726/20220726_mass.TXT/numerical_variable_names", + "/smps/20220726/20220726_mass.TXT/numerical_variables", + "/smps/20220726/20220726_num.TXT", + "/smps/20220726/20220726_num.TXT/categorial_variable_names", + "/smps/20220726/20220726_num.TXT/categorical_variables", + "/smps/20220726/20220726_num.TXT/numerical_variable_names", + "/smps/20220726/20220726_num.TXT/numerical_variables" ], "hovertemplate": "%{label}
Count: %{value}
Path: %{customdata}", "labels": [ "/", - "/Humidity_Sensors", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022/01_Januar", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/02_Februar", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/03_März", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/categorial_variable_names", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/categorical_variables", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/numerical_variable_names", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat/numerical_variables", - "/Humidity_Sensors/2022/04_April", - "/Humidity_Sensors/2022/05_Mai", - "/Humidity_Sensors/2022/06_Juni", - "/Humidity_Sensors/2022/07_Juli", - "/Humidity_Sensors/2022/10_Oktober", - "/Humidity_Sensors/2022/11_November", - "/Humidity_Sensors/2022/12_Dezember", - "/ICAD", - "/ICAD/HONO", - "/ICAD/HONO/2022", - "/ICAD/HONO/2022/10_Oct", - "/ICAD/HONO/2022/11_Nov", - "/ICAD/HONO/2022/12_Dec", - "/ICAD/HONO_prototype", - "/ICAD/HONO_prototype/2022", - "/ICAD/HONO_prototype/2022/01_Jan", - "/ICAD/HONO_prototype/2022/02_Feb", - "/ICAD/NO2", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022/01_Jan", - "/ICAD/NO2/2022/02_Feb", - "/ICAD/NO2/2022/03_Mar", - "/ICAD/NO2/2022/04_Apr", - "/ICAD/NO2/2022/05_May", - "/ICAD/NO2/2022/06_June", - "/ICAD/NO2/2022/07_July", - "/ICAD/NO2/2022/10_Oct", - "/ICAD/NO2/2022/11_Nov", - "/ICAD/NO2/2022/12_Dec" + "/gas", + "/gas/20220726_000004_MSC_gases.txt", + "/gas/20220726_000004_MSC_gases.txt/categorial_variable_names", + "/gas/20220726_000004_MSC_gases.txt/categorical_variables", + "/gas/20220726_000004_MSC_gases.txt/numerical_variable_names", + "/gas/20220726_000004_MSC_gases.txt/numerical_variables", + "/gas/20220726_101617_MSC_gases.txt", + 
"/gas/20220726_101617_MSC_gases.txt/categorial_variable_names", + "/gas/20220726_101617_MSC_gases.txt/categorical_variables", + "/gas/20220726_101617_MSC_gases.txt/numerical_variable_names", + "/gas/20220726_101617_MSC_gases.txt/numerical_variables", + "/smps", + "/smps/20220726", + "/smps/20220726/20220726_mass.TXT", + "/smps/20220726/20220726_mass.TXT/categorial_variable_names", + "/smps/20220726/20220726_mass.TXT/categorical_variables", + "/smps/20220726/20220726_mass.TXT/numerical_variable_names", + "/smps/20220726/20220726_mass.TXT/numerical_variables", + "/smps/20220726/20220726_num.TXT", + "/smps/20220726/20220726_num.TXT/categorial_variable_names", + "/smps/20220726/20220726_num.TXT/categorical_variables", + "/smps/20220726/20220726_num.TXT/numerical_variable_names", + "/smps/20220726/20220726_num.TXT/numerical_variables" ], "name": "", "parents": [ "", "/", - "/Humidity_Sensors", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022/01_Januar", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/01_Januar/2022-01-31_09.37.56_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022/02_Februar", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_09.07.50_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/02_Februar/2022-02-11_16.46.26_PC10228_Humidity_Sensors.dat", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022/03_März", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022/03_März/2022-03-14_09.05.01_PC14751_Humidity_Sensors.dat", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022", - "/Humidity_Sensors/2022", + "/gas", + "/gas/20220726_000004_MSC_gases.txt", + "/gas/20220726_000004_MSC_gases.txt", + "/gas/20220726_000004_MSC_gases.txt", + "/gas/20220726_000004_MSC_gases.txt", + "/gas", + "/gas/20220726_101617_MSC_gases.txt", + "/gas/20220726_101617_MSC_gases.txt", + "/gas/20220726_101617_MSC_gases.txt", + "/gas/20220726_101617_MSC_gases.txt", "/", - "/ICAD", - "/ICAD/HONO", - "/ICAD/HONO/2022", - "/ICAD/HONO/2022", - "/ICAD/HONO/2022", - "/ICAD", - "/ICAD/HONO_prototype", - "/ICAD/HONO_prototype/2022", - "/ICAD/HONO_prototype/2022", - "/ICAD", - "/ICAD/NO2", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022", - "/ICAD/NO2/2022" + "/smps", + "/smps/20220726", + "/smps/20220726/20220726_mass.TXT", 
+ "/smps/20220726/20220726_mass.TXT", + "/smps/20220726/20220726_mass.TXT", + "/smps/20220726/20220726_mass.TXT", + "/smps/20220726", + "/smps/20220726/20220726_num.TXT", + "/smps/20220726/20220726_num.TXT", + "/smps/20220726/20220726_num.TXT", + "/smps/20220726/20220726_num.TXT" ], "root": { "color": "lightgrey" }, "type": "treemap", "values": [ - 1, - 1, - 0, - 5, - 1, - 1, - 1, - 1, - 1, - 14, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 6, - 1, + 2, + 2, + 4, 1, 1, 1, 1, 4, - 9, - 11, - 3, - 8, - 17, + 1, + 1, + 1, + 1, + 1, 2, - 1, - 1, - 0, - 7, - 8, - 2, - 1, - 1, - 3, - 6, - 1, - 0, - 3, - 6, - 5, - 3, 4, - 6, - 2, - 5, - 8, - 2 + 1, + 1, + 1, + 1, + 4, + 1, + 1, + 1, + 1 ] } ], @@ -1116,18 +1006,37 @@ } ], "source": [ - "import sys\n", - "import os\n", - "root_dir = os.path.abspath(os.curdir)\n", - "sys.path.append(root_dir)\n", + "#input_file_dir = '//fs101/5505/People/Juan/TypicalBeamTime'\n", + "#select_file_keywords=[]\n", + "#select_dir_keywords = ['NEXAFS', 'Notes', 'Photos', 'Pressure', 'RGA', 'SES']\n", "\n", - "import src.hdf5_vis as hdf5_vis\n", - "import src.hdf5_lib as hdf5_lib\n", "\n", - "output_filename_path, output_yml_filename_path = hdf5_lib.main()\n", + "#input_file_dir = '//fs101/5505/Data' \n", + "#select_dir_keywords = ['Lopap', 'Humidity_Sensors/2022', 'ICAD/HONO/2022', 'ICAD/NO2/2022', 'T200_NOX', 'T360U_CO2']\n", + "#select_file_keywords = ['2022-03-25','2022_03_25','20220325']\n", + "\n", + "\n", + "input_file_dir = '//fs03/Iron_Sulphate'\n", + "select_dir_keywords = ['gas','smps/20220726']#,'htof/2022.07.26','ptr/2022.07.26','ams/2022.07.26']\n", + "#select_dir_keywords = ['htof','ams', 'ptr', 'gas','smps'] \n", + "\n", + "select_file_keywords = ['20220726','2022.07.26']\n", + "\n", + "config_param = {'group_id' : group_id, 'user_initials' : user_initials, 'output_dir': output_dir}\n", + "\n", + "\n", + "output_filename_path, output_yml_filename_path = hdf5_lib.create_hdf5_file_from_filesystem_path(config_param,\n", + " input_file_dir,\n", + " select_dir_keywords,\n", + " select_file_keywords)\n", "\n", "hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { From 719e9d66720d40576d6cab593bc218d1b789f1d9 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Wed, 3 Apr 2024 13:55:54 +0200 Subject: [PATCH 08/15] Repurposed the role of the config_file.py. Now it only provides functions to select the file_readers based on group id and produce a created_at timestamp. 
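
A minimal usage sketch of the repurposed module (the file path argument is
illustrative; the functions and extension-to-reader mappings come from this patch):

    import input_files.config_file as config_file

    readers = config_file.select_file_readers('smog_chamber')  # {'.txt': ..., '.TXT': ..., '.h5': ...}
    file_dict = readers['.txt']('some/dir/20220726_000004_MSC_gases.txt')  # dispatch by extension
    timestamp = config_file.created_at()  # e.g. '2024-04-03_UTC-OFST_+0200'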
--- input_files/config_file.py | 91 ++++++++------------------------------ 1 file changed, 19 insertions(+), 72 deletions(-) diff --git a/input_files/config_file.py b/input_files/config_file.py index 7acd38e..6531151 100644 --- a/input_files/config_file.py +++ b/input_files/config_file.py @@ -14,85 +14,32 @@ import pygit2 as pygit author = pygit.Signature('Florez Ospina Juan Felipe', 'juan.florez-ospina@psi.ch') committer = pygit.Signature('Florez Ospina Juan Felipe', 'juan.florez-ospina@psi.ch') -group_id = 'smog_chamber' - -#group_id = '5505' output_filename_tempate = lambda group_id, timestamp,user_initials : '_'.join(['unified_file',group_id,timestamp,user_initials])+'.h5' +def created_at(): -now = datetime.now() -# Populate now object with time zone infotmation obtained from the local system -now_tz_aware = now.astimezone() -tz = now_tz_aware.strftime('%z') -#created_at = now_tz_aware.strftime('%Y-%m-%d_%H-%M-%S')+'_utcoffset_' + tz -created_at = now_tz_aware.strftime('%Y-%m-%d')+'_UTC-OFST_' + tz -# Make created at timestamp with tz information -#created_at = now.isoformat() - -usecase = 1 - -if usecase == 1: - - group_id == 'smog_chamber' - user_initials = 'NG' - #from smog_chamber_file_reader import read_txt_files_as_dict - #from g5505_file_reader import copy_file_in_group - #select_dir_keywords = ['htof','ams', 'ptr', 'gas','smps'] - inputfile_dir = '\\\\fs03\\Iron_Sulphate' - inputfile_dir = '//fs03/Iron_Sulphate' - #select_dir_keywords = ['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26'] - select_dir_keywords = ['gas','smps/20220726']#,'htof/2022.07.26','ptr/2022.07.26','ams/2022.07.26'] - select_file_keywords = ['20220726','2022.07.26'] - - outputfile_dir = 'output_files' - - output_filename = output_filename_tempate(group_id,created_at,user_initials) #'test_smog_chamber_v14.h5' - #output_filename = 'unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5' + now = datetime.now() + # Populate now object with time zone infotmation obtained from the local system + now_tz_aware = now.astimezone() + tz = now_tz_aware.strftime('%z') + #created_at = now_tz_aware.strftime('%Y-%m-%d_%H-%M-%S')+'_utcoffset_' + tz + created_at = now_tz_aware.strftime('%Y-%m-%d')+'_UTC-OFST_' + tz + # Make created at timestamp with tz information + #created_at = now.isoformat() + return created_at - ext_to_reader_dict = {'.txt': scf_reader.read_txt_files_as_dict, +def select_file_readers(group_id): + if group_id == '5505': + ext_to_reader_dict = {'.ibw': g5505f_reader.read_xps_ibw_file_as_dict, + '.txt': g5505f_reader.read_txt_files_as_dict, + '.dat': g5505f_reader.read_txt_files_as_dict, + '.h5': g5505f_reader.copy_file_in_group} + elif group_id == 'smog_chamber': + ext_to_reader_dict = {'.txt': scf_reader.read_txt_files_as_dict, '.TXT': scf_reader.read_txt_files_as_dict, '.h5': g5505f_reader.copy_file_in_group} - -elif usecase == 2 : - group_id == '5505' - user_initials = 'TBR' - outputfile_dir = 'output_files' - #output_filename = 'test_sls_data_v8.h5' - inputfile_dir = '//fs101/5505/People/Juan/TypicalBeamTime' - select_file_keywords=[] - select_dir_keywords = ['NEXAFS', 'Notes', 'Photos', 'Pressure', 'RGA', 'SES'] - - output_filename = output_filename_tempate(group_id,created_at,user_initials) - #output_filename = 'unified_file_5505_2024-03-19_UTC-OFST_+0100_TBR.h5' - - ext_to_reader_dict = {'.ibw': g5505f_reader.read_xps_ibw_file_as_dict, - '.txt': g5505f_reader.read_txt_files_as_dict, - '.dat': g5505f_reader.read_txt_files_as_dict, - '.h5': 
g5505f_reader.copy_file_in_group} -elif usecase == 3: - user_initials = 'LL' - outputfile_dir = 'output_files' - output_filename = output_filename_tempate(group_id,created_at,user_initials) - - inputfile_dir = '//fs101/5505/Data' - - #select_dir_keywords = ['Lopap', 'Humidity_Sensors', 'ICAD/HONO', 'ICAD/NO2', 'T200_NOX', 'T360U_CO2'] - # TODO: make sure in the code composite keywords are broken down into single keywords - - ##select_dir_keywords = ['Humidity_Sensors','ICAD/HONO','ICAD/NO2'] - select_dir_keywords = ['Humidity_Sensors/2022','ICAD/HONO/2022','ICAD/NO2/2022', '2022/01_Jan', '2022/02_Feb', '2022/03_März'] - - dates = pd.read_excel(os.path.abspath(os.path.join('input_files','date_experiments_for Juan.xlsx'))) - - select_file_keywords=[item.strftime('%Y-%m-%d') for item in dates.loc[0:2,'experiment_date']] - select_file_keywords= select_file_keywords + [item.strftime('%Y%m%d') for item in dates.loc[0:2,'experiment_date']] - - ext_to_reader_dict = {'.ibw': g5505f_reader.read_xps_ibw_file_as_dict, - '.txt': g5505f_reader.read_txt_files_as_dict, - '.dat': g5505f_reader.read_txt_files_as_dict, - '.h5': g5505f_reader.copy_file_in_group} - + return ext_to_reader_dict From 72e37ed277b9255dbebd912cb7d90739aa36fb4d Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Thu, 4 Apr 2024 09:18:36 +0200 Subject: [PATCH 09/15] Implemented jupyter notebooks for metadata review workflow excecution. --- workflow_data_owner_review.ipynb | 170 +++++++++++++++++++++++++++++++ workflow_metadata_reviewer.ipynb | 18 ++++ 2 files changed, 188 insertions(+) create mode 100644 workflow_data_owner_review.ipynb create mode 100644 workflow_metadata_reviewer.ipynb diff --git a/workflow_data_owner_review.ipynb b/workflow_data_owner_review.ipynb new file mode 100644 index 0000000..0047702 --- /dev/null +++ b/workflow_data_owner_review.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import libraries and modules\n", + "\n", + "* Excecute (or Run) Cell" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "root_dir = os.path.abspath(os.curdir)\n", + "sys.path.append(root_dir)\n", + "\n", + "import src.metadata_review_lib as metadata_review_lib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 1: initialize metadata review.\n", + "\n", + "* Specify hdf5 file whose metadata is to be reviewed by editing the string variable `hdf5_file_path`.\n", + "* Edit reviewer attributes, i.e., the dict variable `reviewer_attrs` with your own initials and role.\n", + "* Excecute Cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Create branch metadata-review-by-NG\n", + "\n" + ] + } + ], + "source": [ + "\n", + "#hdf5_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.h5\"\n", + "#yml_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.yaml\"\n", + "\n", + "hdf5_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.h5\"\n", + "yml_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.yaml\"\n", + "\n", + "reviewer_attrs = {'initials': 'NG',\n", + " 'type': 'data-owner'}\n", + "\n", + "#output_filename_path, output_yml_filename_path = hdf5_lib.main()\n", + "\n", + "review_yaml_file_path = metadata_review_lib.first_initialize_metadata_review(hdf5_file_path, reviewer_attrs) \n", + "\n", + "print(review_yaml_file_path) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 2: Submit metadata review. \n", + "\n", + "* Edit yaml file in review folder and save changes\n", + "* Excecute Cell." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[data-owner-review-by-NG accb271] Submitted metadata review.\n", + " Committer: Florez Ospina Juan Felipe \n", + "Your name and email address were configured automatically based\n", + "on your username and hostname. Please check that they are accurate.\n", + "You can suppress this message by setting them explicitly. Run the\n", + "following command and follow the instructions in your editor to edit\n", + "your configuration file:\n", + "\n", + " git config --global --edit\n", + "\n", + "After doing this, you may fix the identity used for this commit with:\n", + "\n", + " git commit --amend --reset-author\n", + "\n", + " 2 files changed, 3 insertions(+), 3 deletions(-)\n" + ] + } + ], + "source": [ + "metadata_review_lib.second_submit_metadata_review(review_yaml_file_path,reviewer_attrs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 3: Update hdf5 file metadata w/ submitted review yaml file.\n", + "\n", + "* Make sure previous step was carried out properly.\n", + "* Excecute Cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "metadata_review_lib.third_update_hdf5_file_with_review(hdf5_file_path, review_yaml_file_path, reviewer_attrs)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 4: Complete data-owner review. Update remote repository\n", + "\n", + "* Excecute Cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata_review_lib.fourth_complete_metadata_review(reviewer_attrs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "test_atmos_chem_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/workflow_metadata_reviewer.ipynb b/workflow_metadata_reviewer.ipynb new file mode 100644 index 0000000..709d82c --- /dev/null +++ b/workflow_metadata_reviewer.ipynb @@ -0,0 +1,18 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0417ac6debaaebd09353ac1ab7918d2fe3b3e6dd Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Thu, 4 Apr 2024 09:31:19 +0200 Subject: [PATCH 10/15] Modified hdf5 file path whose metadata is to be reviewed. --- workflow_data_owner_review.ipynb | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/workflow_data_owner_review.ipynb b/workflow_data_owner_review.ipynb index 0047702..beb998e 100644 --- a/workflow_data_owner_review.ipynb +++ b/workflow_data_owner_review.ipynb @@ -40,11 +40,15 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Create branch metadata-review-by-NG\n", - "\n" + "ename": "ValueError", + "evalue": "metadata review cannot be initialized. The associated .yaml file under review was not found. 
Run take_yml_snapshot_of_hdf5_file(filename_path) ",
     "output_type": "error",
     "traceback": [
      "[... ANSI-colored traceback elided: the ValueError is raised at src/metadata_review_lib.py:69 in first_initialize_metadata_review, called from the notebook cell, because the .yaml snapshot of the input .h5 file does not exist ...]",
      "\u001b[1;31mValueError\u001b[0m: metadata review cannot be initialized. The associated .yaml file under review was not found. 
Run take_yml_snapshot_of_hdf5_file(filename_path) " ] } ], @@ -53,8 +57,8 @@ "#hdf5_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.h5\"\n", "#yml_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.yaml\"\n", "\n", - "hdf5_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.h5\"\n", - "yml_file_path = \"output_files/unified_file_smog_chamber_2024-03-25_UTC-OFST_+0100_NG.yaml\"\n", + "hdf5_file_path = \"output_files/unified_file_smog_chamber_2024-04-03_UTC-OFST_+0200_NG.h5\"\n", + "yml_file_path = \"output_files/unified_file_smog_chamber_2024-04-03_UTC-OFST_+0200_NG.yaml\"\n", "\n", "reviewer_attrs = {'initials': 'NG',\n", " 'type': 'data-owner'}\n", From fa4fe691d0dde142abda44bfa7afc35f95cab3ae Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Thu, 4 Apr 2024 11:02:24 +0200 Subject: [PATCH 11/15] Refactored a few git statemets in terms of subprocess.run --- src/metadata_review_lib.py | 79 +++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py index 4ba095d..33adc6d 100644 --- a/src/metadata_review_lib.py +++ b/src/metadata_review_lib.py @@ -35,16 +35,25 @@ def get_review_status(filename_path): workflow_steps.append(line) return workflow_steps[-1] -def checkout_review_branch(repo_obj,branch_name): +def checkout_review_branch(branch_name): # Create a new branch #branch_name = 'metadata-review-by-'+initials head_commit = repo_obj.head.peel()# Get the commit hash associated with HEAD - if not branch_name in repo_obj.branches: - branch = repo_obj.create_branch(branch_name, head_commit) - else: - branch = repo_obj.branches[branch_name] - repo_obj.checkout(branch) + checkout_branch_command = lambda branch_name : ['git','checkout', branch_name] + output = subprocess.run(checkout_branch_command(branch_name), capture_output=True,text=True,check=True) + + print(output.stdout) + + #if not branch_name in repo_obj.branches: + # branch = repo_obj.create_branch(branch_name, head_commit) + #else: + # branch = repo_obj.branches[branch_name] + #repo_obj.checkout(branch) + +status_command = ['git','status'] +add_command = lambda add_list: ['git','add'] + add_list +commit_command = lambda message: ['git','commit','-m', message] def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs): @@ -70,34 +79,54 @@ def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs): # Initialize metadata review workflow print("Create branch metadata-review-by-"+initials+"\n") + + checkout_review_branch(branch_name) + + current_branch_command = ['git','branch','--show-current'] + curr_branch = subprocess.run(current_branch_command,capture_output=True,text=True,check=True) + if not branch_name in curr_branch.stdout: + print('Fail to checkout branch. ') + # Check if review file already exists and then check if it is still untracked review_yaml_file_path = os.path.join("review/",filename+YAML_EXT) + if not os.path.exists(review_yaml_file_path): review_yaml_file_path = utils.make_file_copy(os.path.join(hdf5_file_path_tail,filename+YAML_EXT), 'review') - #else: - # raise Warning("the file " + os.path.join("review/",filename+YAML_EXT)+ " already exists. 
Delete this file to reinitialize the metadata review process.")
-
     review_yaml_file_path_tail, ext = os.path.splitext(review_yaml_file_path)
     with open(os.path.join(review_yaml_file_path_tail+"-review_status"+".txt"),'w') as f:
         f.write('under review')
 
-    checkout_review_branch(repo_obj, branch_name)
+    # Stage review files and commit them to the local repository
+    status = subprocess.run(status_command,capture_output=True,text=True,check=True)
+    untracked_files_for_review = []
+    for line in status.stdout.splitlines():
+        # status was run with text=True, so each line is already a str
+        if 'review/' in line:
+            untracked_files_for_review.append(line.strip())
 
-    status_dict = repo_obj.status()
-    for filepath, file_status in status_dict.items():
+    result = subprocess.run(add_command(untracked_files_for_review),capture_output=True,check=True)
+    message = 'Initialized metadata review.'
+    commit_output = subprocess.run(commit_command(message),capture_output=True,text=True,check=True)
+
+    print(commit_output.stdout)
+
+
+
+
+    #status_dict = repo_obj.status()
+    #for filepath, file_status in status_dict.items():
         # Identify keys associated to review files and stage them
-        if 'review/'+filename in filepath:
+    #    if 'review/'+filename in filepath:
             # Stage changes
-            repo_obj.index.add(filepath)
+    #       repo_obj.index.add(filepath)
 
-    author = config_file.author #default_signature
-    committer = config_file.committer
-    message = "Initialized metadata review process."
-    tree = repo_obj.index.write_tree()
-    oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])
+    #author = config_file.author #default_signature
+    #committer = config_file.committer
+    #message = "Initialized metadata review process."
+    #tree = repo_obj.index.write_tree()
+    #oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])
 
     #print("Add and commit"+"\n")
 
@@ -119,7 +148,13 @@ def second_submit_metadata_review(review_yaml_file_path, reviewer_attrs):
     initials = reviewer_attrs['initials']
     branch_name = '-'.join([reviewer_attrs['type'],'review','by',initials])
     # TODO: replace with subprocess + git
-    checkout_review_branch(repo_obj, branch_name)
+    #checkout_review_branch(repo_obj, branch_name)
+
+    current_branch_command = ['git','branch','--show-current']
+    curr_branch = subprocess.run(current_branch_command,capture_output=True,text=True,check=True)
+
+    if not branch_name in curr_branch.stdout:
+        raise ValueError('Make sure you run the initial workflow step first. If you are using the Jupyter notebook, 
execute the cell associated with Step 1.')
 
 
     #if any([status in get_review_status(filename_path) for status in ['under review','submitted']]):
     #    filename_path_tail, filename_path_head = os.path.split(filename_path)
     #    filename, ext = os.path.splitext(filename_path_head)
     #    # TODO:
 
 
     ##
-    status_command = ['git','status']
-    add_command = lambda add_list: ['git','add'] + add_list
-    commit_command = lambda message: ['git','commit','-m', message]
+
     #push_command = lambda repository,refspec: ['git','push',repository,refspec]
 
     status = subprocess.run(status_command,capture_output=True,check=True)
 

From 96c68f76146a61a276cf9de53b9ca781c056103d Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Thu, 4 Apr 2024 11:18:07 +0200
Subject: [PATCH 12/15] Added .ipynb files to gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 5de2c72..6909c5d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.pyc
 __pycache__/
 *.h5
-tmp_files/
\ No newline at end of file
+tmp_files/
+*.ipynb
\ No newline at end of file

From 2d5fecfb3437dc961453cd891bf53448d3f4ab9e Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Thu, 4 Apr 2024 12:56:37 +0200
Subject: [PATCH 13/15] Removed git checkout statements to avoid conflicting
 changes of .ipynb files.

---
 src/metadata_review_lib.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py
index 33adc6d..6a5d59f 100644
--- a/src/metadata_review_lib.py
+++ b/src/metadata_review_lib.py
@@ -78,15 +78,15 @@ def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs):
         raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run take_yml_snapshot_of_hdf5_file(filename_path) ")
 
     # Initialize metadata review workflow
-    print("Create branch metadata-review-by-"+initials+"\n")
+    # print("Create branch metadata-review-by-"+initials+"\n")
 
-    checkout_review_branch(branch_name)
+    #checkout_review_branch(branch_name)
 
     current_branch_command = ['git','branch','--show-current']
     curr_branch = subprocess.run(current_branch_command,capture_output=True,text=True,check=True)
     if not branch_name in curr_branch.stdout:
-        print('Failed to checkout branch.')
+        raise ValueError("Please checkout the branch: "+branch_name+" via Git Bash Terminal while in the project's directory")
 
     # Check if review file already exists and then check if it is still untracked
     review_yaml_file_path = os.path.join("review/",filename+YAML_EXT)
@@ -154,7 +154,7 @@ def second_submit_metadata_review(review_yaml_file_path, reviewer_attrs):
     curr_branch = subprocess.run(current_branch_command,capture_output=True,text=True,check=True)
 
     if not branch_name in curr_branch.stdout:
-        raise ValueError('Make sure you run the initial workflow step first. If you are using the Jupyter notebook, execute the cell associated with Step 1.')
+        raise ValueError('Make sure you are located at branch ' + branch_name + '. Try to execute the cell for Step 1 first.')

From dd1f1245e394c41c848bf86d5515e10a3b748114 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Thu, 4 Apr 2024 12:58:17 +0200
Subject: [PATCH 14/15] Refactored comment lines.
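
Renamed files_to_add_list to modified_files and grouped the shared git
command templates (current_branch_command, status_command, add_command,
commit_command) together so both workflow steps reuse them.

As a possible follow-up (not part of this patch), the status parsing in
second_submit_metadata_review below could read `git status --porcelain`
instead of scraping the human-readable `git status` output; the porcelain
"XY <path>" records are stable across git versions and locales. A minimal
sketch, with list_changed_review_files as a hypothetical helper name:

    import subprocess

    def list_changed_review_files(review_dir='review/'):
        # --porcelain emits one machine-readable "XY <path>" record per changed file
        status = subprocess.run(['git', 'status', '--porcelain'],
                                capture_output=True, text=True, check=True)
        changed = []
        for line in status.stdout.splitlines():
            path = line[3:].strip()  # drop the two status letters and the separator
            if path.startswith(review_dir):
                changed.append(path)
        return changed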
---
 src/metadata_review_lib.py | 49 ++++++++++----------------------------
 1 file changed, 12 insertions(+), 37 deletions(-)

diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py
index 6a5d59f..cd85a37 100644
--- a/src/metadata_review_lib.py
+++ b/src/metadata_review_lib.py
@@ -50,7 +50,8 @@ def checkout_review_branch(branch_name):
     #else:
     #    branch = repo_obj.branches[branch_name]
     #repo_obj.checkout(branch)
-    
+
+current_branch_command = ['git','branch','--show-current']
 status_command = ['git','status']
 add_command = lambda add_list: ['git','add'] + add_list
 commit_command = lambda message: ['git','commit','-m', message]
@@ -82,7 +83,7 @@ def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs):
 
     #checkout_review_branch(branch_name)
 
-
+    # Check that you are working on the right branch
     current_branch_command = ['git','branch','--show-current']
     curr_branch = subprocess.run(current_branch_command,capture_output=True,text=True,check=True)
     if not branch_name in curr_branch.stdout:
@@ -150,43 +151,31 @@ def second_submit_metadata_review(review_yaml_file_path, reviewer_attrs):
     # TODO: replace with subprocess + git
     #checkout_review_branch(repo_obj, branch_name)
 
-    current_branch_command = ['git','branch','--show-current']
+    # Check that you are working on the right branch
     curr_branch = subprocess.run(current_branch_command,capture_output=True,text=True,check=True)
-
     if not branch_name in curr_branch.stdout:
-        raise ValueError('Make sure you are located at branch ' + branch_name + '. Try to execute the cell for Step 1 first.')
-
-
-    #if any([status in get_review_status(filename_path) for status in ['under review','submitted']]):
-    #    filename_path_tail, filename_path_head = os.path.split(filename_path)
-    #    filename, ext = os.path.splitext(filename_path_head)
-    #    # TODO:
-
-
-    ##
-
-    #push_command = lambda repository,refspec: ['git','push',repository,refspec]
+        raise ValueError('Please checkout ' + branch_name + ' via Git Bash before submitting metadata review files.')
 
+    # Collect modified review files
     status = subprocess.run(status_command,capture_output=True,check=True)
-
-    files_to_add_list = []
+    modified_files = []
     for line in status.stdout.splitlines():
         # convert line from bytes to str
         tmp = line.decode("utf-8")
         if 'modified' in tmp and review_yaml_file_path in tmp:
-            files_to_add_list.append(tmp.split()[1])
-    ##
+            modified_files.append(tmp.split()[1])
 
+    # Stage modified files and commit them to the local repository
     review_yaml_file_path_tail, review_yaml_file_path_head = os.path.split(review_yaml_file_path)
     filename, ext = os.path.splitext(review_yaml_file_path_head)
-    if files_to_add_list:
+    if modified_files:
         review_status_file_path = os.path.join("review/",filename+"-review_status"+TXT_EXT)
         with open(review_status_file_path,'a') as f:
             f.write('\nsubmitted')
 
-        files_to_add_list.append(review_status_file_path)
+        modified_files.append(review_status_file_path)
 
-        result = subprocess.run(add_command(files_to_add_list),capture_output=True,check=True)
+        result = subprocess.run(add_command(modified_files),capture_output=True,check=True)
         message = 'Submitted metadata review.'
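+        # check=True makes subprocess raise CalledProcessError if `git commit` exits non-zero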
commit_output = subprocess.run(commit_command(message),capture_output=True,check=True)
 
@@ -196,20 +185,6 @@ def second_submit_metadata_review(review_yaml_file_path, reviewer_attrs):
         print('Nothing to commit.')
 
 
-    #status_dict = repo_obj.status()
-    #for filepath, file_status in status_dict.items():
-        # Identify keys associated to review files and stage them
-    #    if ('review/'+filename in filepath) and (file_status == pygit.GIT_STATUS_WT_MODIFIED):
-            # Stage changes
-    #        repo_obj.index.add(filepath)
-
-    #author = config_file.author #default_signature
-    #committer = config_file.committer
-    #message = "Submitted metadata review."
-    #tree = repo_obj.index.write_tree()
-    #oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])
-
-
 def third_complete_metadata_review(reviewer_attrs):

From 44d4a7b29be831ce3306a950ae9e584bcc44521c Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Thu, 4 Apr 2024 13:00:56 +0200
Subject: [PATCH 15/15] Updated .ipynb notebook outputs

---
 workflow_data_owner_review.ipynb | 46 +++++++++++---------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/workflow_data_owner_review.ipynb b/workflow_data_owner_review.ipynb
index beb998e..a8fc237 100644
--- a/workflow_data_owner_review.ipynb
+++ b/workflow_data_owner_review.ipynb
@@ -11,7 +11,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -36,19 +36,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Create branch metadata-review-by-NG\n",
+      "\n"
+     ]
+    },
    {
     "ename": "ValueError",
-    "evalue": "metadata review cannot be initialized. The associated .yaml file under review was not found. 
Run take_yml_snapshot_of_hdf5_file(filename_path) ", + "evalue": "('Please checkout the branch: ', 'data-owner-review-by-NG', \" via Git Bash Terminal while in the project's directory\")", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[3], line 12\u001b[0m\n\u001b[0;32m 7\u001b[0m reviewer_attrs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minitials\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNG\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 8\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata-owner\u001b[39m\u001b[38;5;124m'\u001b[39m}\n\u001b[0;32m 10\u001b[0m \u001b[38;5;66;03m#output_filename_path, output_yml_filename_path = hdf5_lib.main()\u001b[39;00m\n\u001b[1;32m---> 12\u001b[0m review_yaml_file_path \u001b[38;5;241m=\u001b[39m \u001b[43mmetadata_review_lib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfirst_initialize_metadata_review\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhdf5_file_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreviewer_attrs\u001b[49m\u001b[43m)\u001b[49m \n\u001b[0;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(review_yaml_file_path) \n", - "File \u001b[1;32mc:\\Users\\florez_j\\Documents\\GitLab\\functionspython\\src\\metadata_review_lib.py:69\u001b[0m, in \u001b[0;36mfirst_initialize_metadata_review\u001b[1;34m(hdf5_file_path, reviewer_attrs)\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[38;5;66;03m# Verify if yaml snapshot of input h5 file exists \u001b[39;00m\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(hdf5_file_path_tail,filename\u001b[38;5;241m+\u001b[39mYAML_EXT)):\n\u001b[1;32m---> 69\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata review cannot be initialized. The associated .yaml file under review was not found. Run take_yml_snapshot_of_hdf5_file(filename_path) \u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 71\u001b[0m \u001b[38;5;66;03m# Initialize metadata review workflow\u001b[39;00m\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCreate branch metadata-review-by-\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m+\u001b[39minitials\u001b[38;5;241m+\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[1;31mValueError\u001b[0m: metadata review cannot be initialized. The associated .yaml file under review was not found. 
Run take_yml_snapshot_of_hdf5_file(filename_path) " + "Cell \u001b[1;32mIn[2], line 12\u001b[0m\n\u001b[0;32m 7\u001b[0m reviewer_attrs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minitials\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNG\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 8\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata-owner\u001b[39m\u001b[38;5;124m'\u001b[39m}\n\u001b[0;32m 10\u001b[0m \u001b[38;5;66;03m#output_filename_path, output_yml_filename_path = hdf5_lib.main()\u001b[39;00m\n\u001b[1;32m---> 12\u001b[0m review_yaml_file_path \u001b[38;5;241m=\u001b[39m \u001b[43mmetadata_review_lib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfirst_initialize_metadata_review\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhdf5_file_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreviewer_attrs\u001b[49m\u001b[43m)\u001b[49m \n\u001b[0;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(review_yaml_file_path) \n", + "File \u001b[1;32mc:\\Users\\florez_j\\Documents\\GitLab\\functionspython\\src\\metadata_review_lib.py:89\u001b[0m, in \u001b[0;36mfirst_initialize_metadata_review\u001b[1;34m(hdf5_file_path, reviewer_attrs)\u001b[0m\n\u001b[0;32m 87\u001b[0m curr_branch \u001b[38;5;241m=\u001b[39m subprocess\u001b[38;5;241m.\u001b[39mrun(current_branch_command,capture_output\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,text\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,check\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 88\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m branch_name \u001b[38;5;129;01min\u001b[39;00m curr_branch\u001b[38;5;241m.\u001b[39mstdout:\n\u001b[1;32m---> 89\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease checkout the branch: \u001b[39m\u001b[38;5;124m\"\u001b[39m,branch_name,\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m via Git Bash Terminal while in the project\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms directory\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 91\u001b[0m \u001b[38;5;66;03m# Check if review file already exists and then check if it is still untracked\u001b[39;00m\n\u001b[0;32m 92\u001b[0m review_yaml_file_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreview/\u001b[39m\u001b[38;5;124m\"\u001b[39m,filename\u001b[38;5;241m+\u001b[39mYAML_EXT)\n", + "\u001b[1;31mValueError\u001b[0m: ('Please checkout the branch: ', 'data-owner-review-by-NG', \" via Git Bash Terminal while in the project's directory\")" ] } ], @@ -82,31 +90,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[data-owner-review-by-NG accb271] Submitted metadata review.\n", - " Committer: Florez Ospina Juan Felipe \n", - "Your name and email address were configured automatically based\n", - "on your username and hostname. Please check that they are accurate.\n", - "You can suppress this message by setting them explicitly. 
Run the\n", - "following command and follow the instructions in your editor to edit\n", - "your configuration file:\n", - "\n", - " git config --global --edit\n", - "\n", - "After doing this, you may fix the identity used for this commit with:\n", - "\n", - " git commit --amend --reset-author\n", - "\n", - " 2 files changed, 3 insertions(+), 3 deletions(-)\n" - ] - } - ], + "outputs": [], "source": [ "metadata_review_lib.second_submit_metadata_review(review_yaml_file_path,reviewer_attrs)" ]
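
Note on the refactor above: every git call introduced in patches 11-15 goes
through the same subprocess.run(..., capture_output=True, check=True) pattern.
If the boilerplate keeps growing, it could be collected behind a single
helper; a minimal sketch, assuming a hypothetical run_git helper that is not
part of these patches:

    import subprocess

    def run_git(*args):
        # Run a git command; check=True raises CalledProcessError on a
        # non-zero exit, and text=True returns stdout as str rather than bytes.
        result = subprocess.run(['git', *args],
                                capture_output=True, text=True, check=True)
        return result.stdout

    # Usage mirroring the calls introduced above (hypothetical):
    # current = run_git('branch', '--show-current').strip()
    # run_git('add', 'review/example.yaml')
    # run_git('commit', '-m', 'Submitted metadata review.')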