Synch with remote repo

2025-02-03 10:31:48 +01:00
parent a3ccff4079
commit 32bba4239a
102 changed files with 19584 additions and 19584 deletions


@@ -1,358 +1,358 @@
import subprocess
import os
import utils.g5505_utils as utils
from pipelines.metadata_revision import update_hdf5_file_with_review
def perform_git_operations(hdf5_upload):
status_command = ['git', 'status']
status = subprocess.run(status_command, capture_output=True, check=True)
if hdf5_upload:
upload_ext = ['.h5', '.yaml']
else:
upload_ext = ['.yaml']
files_to_add_list = extract_files_to_add(status.stdout, upload_ext)
if files_to_add_list:
add_files_to_git(files_to_add_list)
commit_changes('Updated hdf5 file with yaml review file.')
else:
print("There were no found h5 and yaml files, needing to be saved. This action will not have effect on the review process' commit history.")
def extract_files_to_add(git_status_output, upload_ext):
files_to_add_list = []
for line in git_status_output.splitlines():
tmp = line.decode("utf-8")
if 'modified' in tmp:
if any(ext in tmp for ext in upload_ext):
files_to_add_list.append(tmp.split()[1])
return files_to_add_list
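# Note: parsing the human-readable `git status` output above is fragile across git versions and locales.
# A minimal sketch of a more robust variant, using the machine-readable `git status --porcelain` format,
# is given below; `extract_files_to_add_porcelain` is an illustrative assumption and is not wired into the workflow.
def extract_files_to_add_porcelain(upload_ext):
    status = subprocess.run(['git', 'status', '--porcelain'], capture_output=True, text=True, check=True)
    files = []
    for line in status.stdout.splitlines():
        # Porcelain lines look like "XY <path>"; drop the two status characters and the separator.
        path = line[3:].strip()
        if any(path.endswith(ext) for ext in upload_ext):
            files.append(path)
    return files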
def add_files_to_git(files_to_add_list):
add_command = ['git', 'add'] + files_to_add_list
    return subprocess.run(add_command, capture_output=True, check=True)
def commit_changes(message):
commit_command = ['git', 'commit', '-m', message]
    commit_output = subprocess.run(commit_command, capture_output=True, check=True)
    print(commit_output.stdout)
    return commit_output
def get_status():
return subprocess.run(['git','status'],capture_output=True,text=True,check=True)
def show_current_branch():
current_branch_command = ['git','branch','--show-current']
    return subprocess.run(current_branch_command, capture_output=True, text=True, check=True)
YAML_EXT = ".yaml"
TXT_EXT = ".txt"
def get_review_status(filename_path):
filename_path_tail, filename_path_head = os.path.split(filename_path)
filename, ext = os.path.splitext(filename_path_head)
# TODO:
with open(os.path.join("review/",filename+"-review_status"+TXT_EXT),'r') as f:
workflow_steps = []
for line in f:
workflow_steps.append(line)
return workflow_steps[-1]
def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False):
"""
First: Initialize review branch with review folder with a copy of yaml representation of
hdf5 file under review and by creating a txt file with the state of the review process, e.g., under review.
"""
initials = reviewer_attrs['initials']
#branch_name = '-'.join([reviewer_attrs['type'],'review_',initials])
branch_name = '_'.join(['review',initials])
hdf5_file_path_tail, filename_path_head = os.path.split(hdf5_file_path)
filename, ext = os.path.splitext(filename_path_head)
# Check file_path points to h5 file
if not 'h5' in ext:
raise ValueError("filename_path needs to point to an h5 file.")
# Verify if yaml snapshot of input h5 file exists
if not os.path.exists(os.path.join(hdf5_file_path_tail,filename+YAML_EXT)):
raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run serialize_metadata(filename_path) ")
# Initialize metadata review workflow
# print("Create branch metadata-review-by-"+initials+"\n")
#checkout_review_branch(branch_name)
# Check you are working at the right branch
curr_branch = show_current_branch()
if not branch_name in curr_branch.stdout:
raise ValueError("Branch "+branch_name+" was not found. \nPlease open a Git Bash Terminal, and follow the below instructions: \n1. Change directory to your project's directory. \n2. Excecute the command: git checkout "+branch_name)
# Check if review file already exists and then check if it is still untracked
review_yaml_file_path = os.path.join("review/",filename+YAML_EXT)
review_yaml_file_path_tail, ext = os.path.splitext(review_yaml_file_path)
review_status_yaml_file_path = os.path.join(review_yaml_file_path_tail+"-review_status"+".txt")
if not os.path.exists(review_yaml_file_path) or restart:
review_yaml_file_path = utils.make_file_copy(os.path.join(hdf5_file_path_tail,filename+YAML_EXT), 'review')
if restart:
            print("Metadata review has been reinitialized. The review files will reflect the current state of the HDF5 file's metadata.")
#if not os.path.exists(os.path.join(review_yaml_file_path_tail+"-review_status"+".txt")):
with open(review_status_yaml_file_path,'w') as f:
f.write('under review')
# Stage untracked review files and commit them to local repository
status = get_status()
untracked_files = []
for line in status.stdout.splitlines():
#tmp = line.decode("utf-8")
#modified_files.append(tmp.split()[1])
if 'review/' in line:
            if not 'modified' in line: # untracked files
untracked_files.append(line.strip())
else:
untracked_files.append(line.strip().split()[1])
if 'output_files/'+filename+YAML_EXT in line and not 'modified' in line:
untracked_files.append(line.strip())
if untracked_files:
        add_files_to_git(untracked_files)
        message = 'Initialized metadata review.'
        commit_output = commit_changes(message)
        for line in commit_output.stdout.splitlines():
            print(line.decode('utf-8'))
#else:
# print('This action will not have any effect because metadata review process has been already initialized.')
#status_dict = repo_obj.status()
#for filepath, file_status in status_dict.items():
# Identify keys associated to review files and stage them
# if 'review/'+filename in filepath:
# Stage changes
# repo_obj.index.add(filepath)
#author = config_file.author #default_signature
#committer = config_file.committer
#message = "Initialized metadata review process."
#tree = repo_obj.index.write_tree()
#oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])
#print("Add and commit"+"\n")
return review_yaml_file_path, review_status_yaml_file_path
def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
"""
Second: Once you're done reviewing the yaml representation of hdf5 file in review folder.
Change the review status to complete and save (add and commit) modified .yalm and .txt files in the project by
running this function.
"""
    # 1. Verify review initialization was performed first
# 2. change review status in txt to complete
# 3. git add review/ and git commit -m "Submitted metadata review"
initials = reviewer_attrs['initials']
#branch_name = '-'.join([reviewer_attrs['type'],'review','by',initials])
branch_name = '_'.join(['review',initials])
# TODO: replace with subprocess + git
#checkout_review_branch(repo_obj, branch_name)
# Check you are working at the right branch
curr_branch = show_current_branch()
if not branch_name in curr_branch.stdout:
raise ValueError('Please checkout ' + branch_name + ' via Git Bash before submitting metadata review files. ')
# Collect modified review files
status = get_status()
modified_files = []
os.path.basename(review_yaml_file_path)
for line in status.stdout.splitlines():
        # get_status() runs git with text=True, so each line is already a str
        tmp = line
if 'modified' in tmp and 'review/' in tmp and os.path.basename(review_yaml_file_path) in tmp:
modified_files.append(tmp.split()[1])
# Stage modified files and commit them to local repository
review_yaml_file_path_tail, review_yaml_file_path_head = os.path.split(review_yaml_file_path)
filename, ext = os.path.splitext(review_yaml_file_path_head)
if modified_files:
review_status_file_path = os.path.join("review/",filename+"-review_status"+TXT_EXT)
with open(review_status_file_path,'a') as f:
f.write('\nsubmitted')
modified_files.append(review_status_file_path)
        add_files_to_git(modified_files)
        message = 'Submitted metadata review.'
        commit_output = commit_changes(message)
        for line in commit_output.stdout.splitlines():
            print(line.decode('utf-8'))
else:
print('Nothing to commit.')
#
def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs=None, hdf5_upload=False):
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('The review YAML file must be submitted before performing an update. Run second_save_metadata_review() first.')
update_hdf5_file_with_review(input_hdf5_file, yaml_review_file)
perform_git_operations(hdf5_upload)
def last_submit_metadata_review(reviewer_attrs):
"""Fourth: """
initials =reviewer_attrs['initials']
repository = 'origin'
branch_name = '_'.join(['review',initials])
push_command = lambda repository,refspec: ['git','push',repository,refspec]
list_branches_command = ['git','branch','--list']
branches = subprocess.run(list_branches_command,capture_output=True,text=True,check=True)
if not branch_name in branches.stdout:
print('There is no branch named '+branch_name+'.\n')
        print('Make sure to run the data-owner review workflow from the beginning without skipping any steps.')
return
curr_branch = show_current_branch()
if not branch_name in curr_branch.stdout:
        print('The metadata review could not be completed.\n')
        print('Make sure a data-owner workflow has already been started on branch '+branch_name+'.\n')
        print('The step "Complete metadata review" will have no effect.')
return
# push
result = subprocess.run(push_command(repository,branch_name),capture_output=True,text=True,check=True)
print(result.stdout)
# 1. git add output_files/
# 2. delete review/
#shutil.rmtree(os.path.join(os.path.abspath(os.curdir),"review"))
# 3. git rm review/
# 4. git commit -m "Completed review process. Current state of hdf5 file and yml should be up to date."
return result.returncode
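# The four review steps above are meant to be run in this order. A hedged usage sketch (file names and
# reviewer attributes are placeholders; calls are commented out, mirroring main() below):
#
#   reviewer = {'initials': 'NG', 'type': 'data-owner'}
#   review_yaml, review_status = first_initialize_metadata_review('output_files/experiment.h5', reviewer)
#   # ... edit review_yaml by hand, then:
#   second_save_metadata_review(review_yaml, reviewer)
#   third_update_hdf5_file_with_review('output_files/experiment.h5', review_yaml, reviewer, hdf5_upload=True)
#   last_submit_metadata_review(reviewer)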
#import config_file
#import hdf5_ops
class MetadataHarvester:
def __init__(self, parent_files=None):
if parent_files is None:
parent_files = []
self.parent_files = parent_files
self.metadata = {
"project": {},
"sample": {},
"environment": {},
"instruments": {},
"datasets": {}
}
def add_project_info(self, key_or_dict, value=None, append=False):
self._add_info("project", key_or_dict, value, append)
def add_sample_info(self, key_or_dict, value=None, append=False):
self._add_info("sample", key_or_dict, value, append)
def add_environment_info(self, key_or_dict, value=None, append=False):
self._add_info("environment", key_or_dict, value, append)
def add_instrument_info(self, key_or_dict, value=None, append=False):
self._add_info("instruments", key_or_dict, value, append)
def add_dataset_info(self, key_or_dict, value=None, append=False):
self._add_info("datasets", key_or_dict, value, append)
def _add_info(self, category, key_or_dict, value, append):
"""Internal helper method to add information to a category."""
if isinstance(key_or_dict, dict):
self.metadata[category].update(key_or_dict)
else:
if key_or_dict in self.metadata[category]:
if append:
current_value = self.metadata[category][key_or_dict]
if isinstance(current_value, list):
if not isinstance(value, list):
# Append the new value to the list
self.metadata[category][key_or_dict].append(value)
else:
self.metadata[category][key_or_dict] = current_value + value
elif isinstance(current_value, str):
# Append the new value as a comma-separated string
self.metadata[category][key_or_dict] = current_value + ',' + str(value)
else:
# Handle other types (for completeness, usually not required)
self.metadata[category][key_or_dict] = [current_value, value]
else:
self.metadata[category][key_or_dict] = value
else:
self.metadata[category][key_or_dict] = value
def get_metadata(self):
return {
"parent_files": self.parent_files,
"metadata": self.metadata
}
def print_metadata(self):
print("parent_files", self.parent_files)
for key in self.metadata.keys():
print(key,'metadata:\n')
for item in self.metadata[key].items():
print(item[0],item[1])
def clear_metadata(self):
self.metadata = {
"project": {},
"sample": {},
"environment": {},
"instruments": {},
"datasets": {}
}
self.parent_files = []
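# A short, illustrative sketch of how MetadataHarvester is meant to be used; the keys and values below
# are assumptions, not a prescribed schema.
def _example_metadata_harvester_usage():
    harvester = MetadataHarvester(parent_files=['output_files/experiment.h5'])
    harvester.add_project_info({'project_name': 'smog_chamber', 'processing_file': 'example.h5'})
    harvester.add_sample_info('sample_name', 'soot')
    # With append=True, a second value for an existing key is accumulated instead of overwritten.
    harvester.add_sample_info('sample_name', 'NaCl', append=True)
    harvester.add_dataset_info('data_table', {'units': 'a.u.'})
    return harvester.get_metadata()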
def main():
output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
output_yml_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.yalm"
output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)
#output_yml_filename_path = hdf5_ops.serialize_metadata(output_filename_path)
#first_initialize_metadata_review(output_filename_path,initials='NG')
#second_submit_metadata_review()
#if os.path.exists(os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename)):
# third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename))
#fourth_complete_metadata_review()

File diff suppressed because it is too large.


@@ -1,396 +1,396 @@
import sys
import os
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)
import pandas as pd
import numpy as np
import h5py
import logging
import utils.g5505_utils as utils
import instruments.readers.filereader_registry as filereader_registry
def __transfer_file_dict_to_hdf5(h5file, group_name, file_dict):
"""
Transfers data from a file_dict to an HDF5 file.
Parameters
----------
h5file : h5py.File
HDF5 file object where the data will be written.
group_name : str
Name of the HDF5 group where data will be stored.
file_dict : dict
Dictionary containing file data to be transferred. Required structure:
{
'name': str,
'attributes_dict': dict,
'datasets': [
{
'name': str,
'data': array-like,
'shape': tuple,
'attributes': dict (optional)
},
...
]
}
    Returns
    -------
    stdout : str or Exception
        A status message on success, or the caught exception if the transfer failed.
    """
if not file_dict:
return
try:
        # Create the group and add its attributes
filename = file_dict['name']
group = h5file[group_name].create_group(name=filename)
# Add group attributes
group.attrs.update(file_dict['attributes_dict'])
# Add datasets to the just created group
for dataset in file_dict['datasets']:
dataset_obj = group.create_dataset(
name=dataset['name'],
data=dataset['data'],
shape=dataset['shape']
)
# Add dataset's attributes
attributes = dataset.get('attributes', {})
dataset_obj.attrs.update(attributes)
group.attrs['last_update_date'] = utils.created_at().encode('utf-8')
stdout = f'Completed transfer for /{group_name}/{filename}'
except Exception as inst:
stdout = inst
logging.error('Failed to transfer data into HDF5: %s', inst)
return stdout
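# For reference, a minimal file_dict satisfying the contract documented above could look like this
# (values are illustrative only):
#
#   file_dict = {
#       'name': 'gas_flow_2024-03-19.txt',
#       'attributes_dict': {'instrument': 'MFC', 'units': 'sccm'},
#       'datasets': [{'name': 'data_table', 'data': np.zeros((10, 2)), 'shape': (10, 2),
#                     'attributes': {'columns': 'time,flow'}}]
#   }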
def __copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name, work_with_copy : bool = True):
# Create copy of original file to avoid possible file corruption and work with it.
if work_with_copy:
tmp_file_path = utils.make_file_copy(source_file_path)
else:
tmp_file_path = source_file_path
    # Open the (backup) h5 file and copy its complete group hierarchy into a group of the destination file
with h5py.File(tmp_file_path,'r') as src_file:
dest_file_obj.copy(source= src_file['/'], dest= dest_group_name)
if 'tmp_files' in tmp_file_path:
os.remove(tmp_file_path)
stdout = f'Completed transfer for /{dest_group_name}'
return stdout
def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
path_to_filenames_dict: dict = None,
select_dir_keywords : list = [],
root_metadata_dict : dict = {}, mode = 'w'):
"""
    Creates an .h5 file, named after the input directory, that preserves the directory tree (or folder structure)
    of the given filesystem path.
    The data integration capabilities are limited by our file readers, which can only access data from a list of
    admissible file formats. These, however, can be extended. Directories become groups in the resulting HDF5 file,
    and files are stored as composite objects consisting of a group, datasets, and attributes.
Parameters
----------
    path_to_input_directory : str
        Path to the root directory, specified with forward slashes, e.g., path/to/root. The output
        HDF5 file is created next to it and named after this directory.
    path_to_filenames_dict : dict, optional
        A pre-processed dictionary where keys are directory paths in the input directory's tree and values are lists of files.
        If provided, 'path_to_input_directory' is not re-scanned.
select_dir_keywords : list
List of string elements to consider or select only directory paths that contain
a word in 'select_dir_keywords'. When empty, all directory paths are considered
to be included in the HDF5 file group hierarchy.
root_metadata_dict : dict
Metadata to include at the root level of the HDF5 file.
    mode : str
        'w' creates a new file (the function refuses to overwrite an existing one); 'r+' opens an existing file for reading and writing. Defaults to 'w'.
Returns
-------
output_filename : str
Path to the created HDF5 file.
"""
    if mode not in ['w','r+']:
        raise ValueError('Parameter mode must take values in ["w","r+"].')
if not '/' in path_to_input_directory:
raise ValueError('path_to_input_directory needs to be specified using forward slashes "/".' )
#path_to_output_directory = os.path.join(path_to_input_directory,'..')
path_to_input_directory = os.path.normpath(path_to_input_directory).rstrip(os.sep)
for i, keyword in enumerate(select_dir_keywords):
select_dir_keywords[i] = keyword.replace('/',os.sep)
if not path_to_filenames_dict:
        # With dry_run=True, this returns the path-to-files dictionary of the output directory without making an actual copy of the input directory.
        # Therefore, there won't be a copying conflict even though the input and output directories are the same.
path_to_filenames_dict = utils.copy_directory_with_contraints(input_dir_path=path_to_input_directory,
output_dir_path=path_to_input_directory,
dry_run=True)
# Set input_directory as copied input directory
root_dir = path_to_input_directory
path_to_output_file = path_to_input_directory.rstrip(os.path.sep) + '.h5'
start_message = f'\n[Start] Data integration :\nSource: {path_to_input_directory}\nDestination: {path_to_output_file}\n'
print(start_message)
logging.info(start_message)
# Check if the .h5 file already exists
if os.path.exists(path_to_output_file) and mode in ['w']:
message = (
f"[Notice] The file '{path_to_output_file}' already exists and will not be overwritten.\n"
"If you wish to replace it, please delete the existing file first and rerun the program."
)
print(message)
logging.error(message)
else:
with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
number_of_dirs = len(path_to_filenames_dict.keys())
dir_number = 1
for dirpath, filtered_filenames_list in path_to_filenames_dict.items():
# Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
if not filtered_filenames_list:
continue
group_name = dirpath.replace(os.sep,'/')
group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/')
# Flatten group name to one level
if select_dir_keywords:
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
else:
offset = 1
tmp_list = group_name.split('/')
if len(tmp_list) > offset+1:
group_name = '/'.join([tmp_list[i] for i in range(offset+1)])
# Create group called "group_name". Hierarchy of nested groups can be implicitly defined by the forward slashes
if not group_name in h5file.keys():
h5file.create_group(group_name)
h5file[group_name].attrs['creation_date'] = utils.created_at().encode('utf-8')
#h5file[group_name].attrs.create(name='filtered_file_list',data=convert_string_to_bytes(filtered_filename_list))
#h5file[group_name].attrs.create(name='file_list',data=convert_string_to_bytes(filenames_list))
#else:
#print(group_name,' was already created.')
instFoldermsgStart = f'Starting data transfer from instFolder: {group_name}'
print(instFoldermsgStart)
for filenumber, filename in enumerate(filtered_filenames_list):
#file_ext = os.path.splitext(filename)[1]
#try:
# hdf5 path to filename group
dest_group_name = f'{group_name}/{filename}'
if not 'h5' in filename:
#file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename))
#file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath,filename))
stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict)
else:
source_file_path = os.path.join(dirpath,filename)
dest_file_obj = h5file
#group_name +'/'+filename
#ext_to_reader_dict[file_ext](source_file_path, dest_file_obj, dest_group_name)
#g5505f_reader.select_file_reader(dest_group_name)(source_file_path, dest_file_obj, dest_group_name)
stdout = __copy_file_in_group(source_file_path, dest_file_obj, dest_group_name, False)
                # Update the progress bar and print/log the end-of-folder message
                instFolderMsgEnd = f'\nCompleted data transfer for instFolder: {group_name}\n'
                utils.progressBar(dir_number, number_of_dirs, instFolderMsgEnd)
                logging.info(instFolderMsgEnd)
dir_number = dir_number + 1
print('[End] Data integration')
logging.info('[End] Data integration')
if len(root_metadata_dict.keys())>0:
for key, value in root_metadata_dict.items():
#if key in h5file.attrs:
# del h5file.attrs[key]
h5file.attrs.create(key, value)
#annotate_root_dir(output_filename,root_metadata_dict)
#output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename)
return path_to_output_file #, output_yml_filename_path
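# A hedged usage sketch of create_hdf5_file_from_filesystem_path; the directory, keywords, and metadata
# below are placeholders:
#
#   out_h5 = create_hdf5_file_from_filesystem_path(
#       'data/beamtime_2024',
#       select_dir_keywords=['smog_chamber', 'gas'],
#       root_metadata_dict={'project': 'smog_chamber', 'contact': 'NG'},
#       mode='w')
#   # out_h5 -> 'data/beamtime_2024.h5', mirroring the selected folder structure as HDF5 groups.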
def create_hdf5_file_from_dataframe(ofilename, input_data, group_by_funcs: list, approach: str = None, extract_attrs_func=None):
"""
Creates an HDF5 file with hierarchical groups based on the specified grouping functions or columns.
Parameters:
-----------
ofilename (str): Path for the output HDF5 file.
input_data (pd.DataFrame or str): Input data as a DataFrame or a valid file system path.
group_by_funcs (list): List of callables or column names to define hierarchical grouping.
approach (str): Specifies the approach ('top-down' or 'bottom-up') for creating the HDF5 file.
extract_attrs_func (callable, optional): Function to extract additional attributes for HDF5 groups.
Returns:
--------
    ofilename (str): Path of the created HDF5 file.
"""
# Check whether input_data is a valid file-system path or a DataFrame
is_valid_path = lambda x: os.path.exists(x) if isinstance(x, str) else False
if is_valid_path(input_data):
# If input_data is a file-system path, create a DataFrame with file info
file_list = os.listdir(input_data)
df = pd.DataFrame(file_list, columns=['filename'])
df = utils.augment_with_filetype(df) # Add filetype information if needed
elif isinstance(input_data, pd.DataFrame):
# If input_data is a DataFrame, make a copy
df = input_data.copy()
else:
raise ValueError("input_data must be either a valid file-system path or a DataFrame.")
# Generate grouping columns based on group_by_funcs
if utils.is_callable_list(group_by_funcs):
grouping_cols = []
for i, func in enumerate(group_by_funcs):
col_name = f'level_{i}_groups'
grouping_cols.append(col_name)
df[col_name] = func(df)
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs
else:
raise ValueError("'group_by_funcs' must be a list of callables or valid column names in the DataFrame.")
# Generate group paths
df['group_path'] = ['/' + '/'.join(row) for row in df[grouping_cols].values.astype(str)]
# Open the HDF5 file in write mode
with h5py.File(ofilename, 'w') as file:
for group_path in df['group_path'].unique():
# Create groups in HDF5
group = file.create_group(group_path)
# Filter the DataFrame for the current group
datatable = df[df['group_path'] == group_path].copy()
# Drop grouping columns and the generated 'group_path'
datatable = datatable.drop(columns=grouping_cols + ['group_path'])
# Add datasets to groups if data exists
if not datatable.empty:
dataset = utils.convert_dataframe_to_np_structured_array(datatable)
group.create_dataset(name='data_table', data=dataset)
# Add attributes if extract_attrs_func is provided
if extract_attrs_func:
attrs = extract_attrs_func(datatable)
for key, value in attrs.items():
group.attrs[key] = value
# Save metadata about depth of hierarchy
file.attrs.create(name='depth', data=len(grouping_cols) - 1)
print(f"HDF5 file created successfully at {ofilename}")
return ofilename
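# A minimal sketch of the DataFrame-based variant, grouping a file table by two existing columns
# (the column names and rows are assumptions):
#
#   df = pd.DataFrame({'filename': ['a.txt', 'b.txt'],
#                      'instrument': ['gas', 'gas'],
#                      'date': ['2024-03-19', '2024-03-20']})
#   create_hdf5_file_from_dataframe('output_files/example.h5', df,
#                                   group_by_funcs=['instrument', 'date'])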
def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name):
"""
Save processed dataframe columns with annotations to an HDF5 file.
Parameters:
df (pd.DataFrame): DataFrame containing processed time series.
    annotator: Object exposing a get_metadata() method (e.g., a MetadataHarvester).
    output_filename (str): Path to the destination HDF5 file.
"""
# Convert datetime columns to string
datetime_cols = df.select_dtypes(include=['datetime64']).columns
if list(datetime_cols):
df[datetime_cols] = df[datetime_cols].map(str)
# Convert dataframe to structured array
icad_data_table = utils.convert_dataframe_to_np_structured_array(df)
# Get metadata
metadata_dict = annotator.get_metadata()
# Prepare project level attributes to be added at the root level
project_level_attributes = metadata_dict['metadata']['project']
# Prepare high-level attributes
high_level_attributes = {
'parent_files': metadata_dict['parent_files'],
**metadata_dict['metadata']['sample'],
**metadata_dict['metadata']['environment'],
**metadata_dict['metadata']['instruments']
}
# Prepare data level attributes
data_level_attributes = metadata_dict['metadata']['datasets']
for key, value in data_level_attributes.items():
if isinstance(value,dict):
data_level_attributes[key] = utils.convert_attrdict_to_np_structured_array(value)
# Prepare file dictionary
file_dict = {
'name': project_level_attributes['processing_file'],
'attributes_dict': high_level_attributes,
'datasets': [{
'name': "data_table",
'data': icad_data_table,
'shape': icad_data_table.shape,
'attributes': data_level_attributes
}]
}
# Check if the file exists
if os.path.exists(output_filename):
mode = "a"
print(f"File {output_filename} exists. Opening in append mode.")
else:
mode = "w"
print(f"File {output_filename} does not exist. Creating a new file.")
# Write to HDF5
with h5py.File(output_filename, mode) as h5file:
# Add project level attributes at the root/top level
h5file.attrs.update(project_level_attributes)
__transfer_file_dict_to_hdf5(h5file, '/', file_dict)
#if __name__ == '__main__':
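# A hedged end-to-end sketch of save_processed_dataframe_to_hdf5. The annotator is assumed to be a
# MetadataHarvester (from the metadata-review module in this commit); names and values are illustrative:
#
#   harvester = MetadataHarvester(parent_files=['output_files/raw.h5'])
#   harvester.add_project_info({'project': 'smog_chamber', 'processing_file': 'processed.h5'})
#   harvester.add_dataset_info('data_table', {'columns': 'time,O3_ppb'})
#   processed_df = pd.DataFrame({'time': pd.date_range('2024-03-19', periods=3, freq='h'),
#                                'O3_ppb': [31.2, 30.8, 29.9]})
#   save_processed_dataframe_to_hdf5(processed_df, harvester, 'output_files/processed.h5')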


@@ -1,270 +1,270 @@
import pandas as pd
import logging
import os
import datetime
from pybis import Openbis
import hidden
admissible_props_list = ['$name', 'filenumber', 'default_experiment.experimental_results',
'dataquality', '$xmlcomments', '$annotations_state',
'sample_name', 'position_x', 'position_y', 'position_z', 'temp', 'cell_pressure', 'gas_flow_setting', 'sample_notes',
'beamline', 'photon_energy', 'slit_entrance_v', 'slit_exit_v', 'izero',
'slit_exit_h', 'hos', 'cone', 'endstation', 'hof',
'method_name', 'region', 'lens_mode', 'acq_mode', 'dwell_time', 'frames', 'passenergy',
'iterations', 'sequenceiterations', 'ke_range_center', 'ke_step']
def initialize_openbis_obj():
# TODO: implement a more secure authentication method.
openbis_obj = Openbis('https://openbis-psi.labnotebook.ch/openbis/webapp/eln-lims/?menuUniqueId=null&viewName=showBlancPage&viewData=null', verify_certificates=False)
openbis_obj.login(hidden.username,hidden.password)
return openbis_obj
def align_datetime_observation_windows(df_h5: pd.DataFrame, df_openbis: pd.DataFrame, h5_datetime_var: str = 'lastModifiedDatestr', ob_datetime_var: str = 'registrationDate') -> pd.DataFrame:
""" returns filtered/reduced versions of 'df' and 'df_ref' with aligned datetime observation windows.
That is, the datetime variable range is the same for the returned dataframes."""
#""returns a filtered or reduced version of 'df' by removing all rows that are outside the datetime variable overlapping region between 'df' and 'df_ref'.
#"""
#df_h5['lastModifiedDatestr'] = df_h5['lastModifiedDatestr'].astype('datetime64[ns]')
#df_h5 = df_h5.sort_values(by='lastModifiedDatestr')
if not (h5_datetime_var in df_h5.columns.to_list() and ob_datetime_var in df_openbis.columns.to_list()):
#TODO: Check if ValueError is the best type of error to raise here
raise ValueError("Dataframes 'df' and 'df_ref' must contain columns 'datetime_var' and 'datetime_var_ref', storing values in suitable datetime string format (e.g., yyyy-mm-dd hh:mm:ss).")
df_h5[h5_datetime_var] = df_h5[h5_datetime_var].astype('datetime64[ns]')
df_openbis[ob_datetime_var] = df_openbis[ob_datetime_var].astype('datetime64[ns]')
min_timestamp = max([df_openbis[ob_datetime_var].min(), df_h5[h5_datetime_var].min()])
max_timestamp = min([df_openbis[ob_datetime_var].max(), df_h5[h5_datetime_var].max()])
    # Determine the overlap between df_h5 and df_openbis, and filter out all rows with datetime values outside the overlapping region.
datetime_overlap_indicator = (df_h5[h5_datetime_var] >= min_timestamp) & (df_h5[h5_datetime_var] <= max_timestamp)
df_h5 = df_h5.loc[datetime_overlap_indicator,:]
datetime_overlap_indicator = (df_openbis[ob_datetime_var] >= min_timestamp) & (df_openbis[ob_datetime_var] <= max_timestamp)
df_openbis = df_openbis.loc[datetime_overlap_indicator,:]
df_h5 = df_h5.sort_values(by=h5_datetime_var)
df_openbis = df_openbis.sort_values(by=ob_datetime_var)
return df_h5, df_openbis
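# A hedged usage sketch (column names follow the function defaults; df_h5 and df_openbis are placeholders):
#
#   df_h5_aligned, df_openbis_aligned = align_datetime_observation_windows(
#       df_h5, df_openbis,
#       h5_datetime_var='lastModifiedDatestr',
#       ob_datetime_var='registrationDate')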
def reformat_openbis_dataframe_filenumber(df_openbis):
if not 'FILENUMBER' in df_openbis.columns:
        raise ValueError('df_openbis does not contain the column "FILENUMBER". Make sure you query it (e.g., o.get_samples(props=["filenumber"])) before creating df_openbis.')
#if not 'name' in df.columns:
# raise ValueError("df does not contain the column 'name'. Ensure df complies with Throsten's Table's format.")
    # Augment df_openbis with a 'name' column consistent with Thorsten's naming convention
name_list = ['0' + item.zfill(3) + item.zfill(3) for item in df_openbis['FILENUMBER']]
df_openbis['REFORMATED_FILENUMBER'] = pd.Series(name_list, index=df_openbis.index)
return df_openbis
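# For reference, the reformatting above maps a FILENUMBER string to '0' + zfill(3) + zfill(3),
# e.g. '23' -> '0023023' and '156' -> '0156156'. An illustrative check (values are made up):
#
#   df_demo = pd.DataFrame({'FILENUMBER': ['23', '156']})
#   reformat_openbis_dataframe_filenumber(df_demo)['REFORMATED_FILENUMBER'].tolist()
#   # -> ['0023023', '0156156']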
def pair_openbis_and_h5_dataframes(df_openbis, df_h5, pairing_ob_var: str, pairing_h5_var: str):
""" Pairs every row (or openbis sample) in 'df_openbis' with a set of rows (or measurements) in 'df_h5' by matching the i-th row in 'df_h5'
with the rows of 'df_h5' that satisfy the string df_openbis.loc[i,pairing_var_1] is contained in the string df_h5[i,pairing_var_2]
Example: pairing_var_1, pairing_var_2 = reformated 'REFORMATED_FILENUMBER', 'name'
"""
# Reformat openbis dataframe filenumber so that it can be used to find associated measurements in h5 dataframe
df_openbis = reformat_openbis_dataframe_filenumber(df_openbis)
related_indices_list = []
for sample_idx in df_openbis.index:
sample_value = df_openbis.loc[sample_idx,pairing_ob_var]
tmp_list = [sample_value in item[0:item.find('_')] for item in df_h5[pairing_h5_var]]
related_indices_list.append(df_h5.index[tmp_list])
        print('Pairing openbis sample: ' + df_openbis.loc[sample_idx,pairing_ob_var])
        print('with reformatted FILENUMBER: ' + sample_value)
print('to following measurements in h5 dataframe:')
print(df_h5.loc[df_h5.index[tmp_list],'name'])
print('\n')
df_openbis['related_h5_indices'] = pd.Series(related_indices_list, index=df_openbis.index)
return df_openbis
def range_cols_2_string(df,lb_var,ub_var):
if not sum(df.loc[:,ub_var]-df.loc[:,lb_var])==0:
#tmp_list = ['-'.join([str(round(df.loc[i,lb_var],2)),str(round(df.loc[i,ub_var],1))]) for i in df.index]
tmp_list = ['-'.join(["{:.1f}".format(df.loc[i,lb_var]),"{:.1f}".format(df.loc[i,ub_var])]) for i in df.index]
elif len(df.loc[:,lb_var].unique())>1: # check if values are different
#tmp_list = [str(round(df.loc[i,lb_var],2)) for i in df.index]
tmp_list = ["{:.1f}".format(df.loc[i,lb_var]) for i in df.index]
else:
#tmp_list = [str(round(df.loc[0,lb_var],2))]
tmp_list = ["{:.1f}".format(df[lb_var].iloc[0])] # use positional access in case the index does not contain the label 0
return '/'.join(tmp_list)
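# Illustrative sketch (hypothetical lower/upper-bound columns):
#   df = pd.DataFrame({'xRayEkinRange_eV_1': [270.0, 380.0], 'xRayEkinRange_eV_2': [290.0, 400.0]})
#   range_cols_2_string(df, 'xRayEkinRange_eV_1', 'xRayEkinRange_eV_2')  # -> '270.0-290.0/380.0-400.0'
#   # If the two columns coincide row-wise, only the formatted lower bounds are returned,
#   # collapsed to a single value when they are all equal.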
def col_2_string(df,column_var):
if not column_var in df.columns:
raise ValueError("Parameter 'column_var' must be one of the columns of df.")
#tmp_list = [str(round(item,1)) for item in df[column_var]]
tmp_list = ["{:.2f}".format(item) for item in df[column_var]]
if len(df[column_var].unique())==1:
tmp_list = [tmp_list[0]]
return '/'.join(tmp_list)
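# Illustrative sketch (hypothetical column values):
#   col_2_string(pd.DataFrame({'scientaPassEnergy_eV': [50.0, 100.0]}), 'scientaPassEnergy_eV')  # -> '50.00/100.00'
#   col_2_string(pd.DataFrame({'scientaPassEnergy_eV': [50.0, 50.0]}), 'scientaPassEnergy_eV')   # -> '50.00'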
def compute_openbis_sample_props_from_h5(df_openbis, df_h5, sample_idx):
prop2attr = {'sample_name':'sample', # ask Thorsten whether this assignment is correct or not
'position_x':'smplX_mm',
'position_y':'smplY_mm',
'position_z':'smplZ_mm',
'temp':'sampleTemp_dC',
'cell_pressure':'cellPressure_mbar',
#'gas_flow_setting': '',
'method_name':'regionName', # measurement type: XPS or NEXAFS
'region':'regionName', # VB/N1s/C1s
'passenergy':'regionName', # REAL
'photon_energy':'xRayEkinRange_eV',
'dwell_time':'scientaDwellTime_ms',
'acq_mode':'scientaAcquisitionMode',
'ke_range_center':'scientaEkinRange_eV',
'ke_step':'scientaEkinStep_eV',
'lens_mode':'scientaLensMode'
}
sample_identifier = df_openbis.loc[sample_idx,'identifier']
props_dict = {'FILENUMBER' : df_openbis.loc[sample_idx,'FILENUMBER']}
#props_dict = {}
if not len(df_openbis.loc[sample_idx,'related_h5_indices']):
props_dict['identifier'] = sample_identifier
return props_dict
reduced_df_h5 = df_h5.loc[df_openbis.loc[sample_idx,'related_h5_indices'],:]
reduced_df_h5 = reduced_df_h5.reset_index()
# include related_samples key for validation purposes. Related samples are used to compute average and/or combined openbis properties.
related_sample_list = [reduced_df_h5['name'][index] for index in reduced_df_h5['name'].index]
related_samples = ' / '.join(related_sample_list)
props_dict['Subject_samples'] = related_samples
props_dict['sample_name'] = reduced_df_h5['sample'].unique()[0] if len(reduced_df_h5['sample'].unique())==1 else '/'.join(reduced_df_h5['sample'].tolist())
if not 'NEXAFS' in reduced_df_h5['regionName'].iloc[0]:
props_dict['identifier'] = sample_identifier
props_dict['method_name'] = 'XPS'
for item_idx in reduced_df_h5.index:
item = reduced_df_h5.loc[item_idx,'regionName']
if item_idx > 0:
props_dict['region'] = props_dict['region'] + '/' + item[0:item.find('_')]
#props_dict['dwell_time'] = props_dict['dwell_time'] + '/' + str(reduced_df_h5.loc[item_idx,'scientaDwellTime_ms'])
#props_dict['ke_range_center'] = props_dict['ke_range_center'] + '/' + str(round(reduced_df_h5.loc[item_idx,['scientaEkinRange_eV_1','scientaEkinRange_eV_2']].mean(),2))
#props_dict['ke_step_center'] = props_dict['ke_step_center'] + '/' + str(reduced_df_h5.loc[item_idx,'scientaEkinStep_eV'])
#props_dict['passenergy'].append(float(item[item.find('_')+1:item.find('eV')]))
else:
props_dict['region'] = item[0:item.find('_')]
#props_dict['dwell_time'] = str(reduced_df_h5.loc[item_idx,'scientaDwellTime_ms'])
#props_dict['ke_range_center'] = str(round(reduced_df_h5.loc[item_idx,['scientaEkinRange_eV_1','scientaEkinRange_eV_2']].mean(),2))
#props_dict['ke_step_center'] = str(reduced_df_h5.loc[item_idx,'scientaEkinStep_eV'])
#props_dict['passenergy'] = reduced_df_h5.loc[:,'scientaPassEnergy_eV'].min()
else:
props_dict = {'identifier':sample_identifier,'method_name':'NEXAFS'}
#props_dict['temp'] = round(reduced_df_h5['sampleTemp_dC'].mean(),2)
#props_dict['cell_pressure'] = round(reduced_df_h5['cellPressure_mbar'].mean(),2)
props_dict['temp'] = "{:.2f}".format(reduced_df_h5['sampleTemp_dC'].mean())
props_dict['cell_pressure'] = "{:.2f}".format(reduced_df_h5['cellPressure_mbar'].mean())
reduced_df_h5['scientaDwellTime_ms'] = reduced_df_h5['scientaDwellTime_ms']*1e-3 # convert ms to seconds
props_dict['dwell_time'] = col_2_string(reduced_df_h5,'scientaDwellTime_ms')
props_dict['passenergy'] = col_2_string(reduced_df_h5,'scientaPassEnergy_eV')
props_dict['ke_step_center'] = col_2_string(reduced_df_h5,'scientaEkinStep_eV')
#props_dict['photon_energy'] =round(reduced_df_h5[['xRayEkinRange_eV_1','xRayEkinRange_eV_2']].mean(axis=1)[0],2)
props_dict['photon_energy'] = range_cols_2_string(reduced_df_h5,'xRayEkinRange_eV_1','xRayEkinRange_eV_2')
props_dict['ke_range_center'] = range_cols_2_string(reduced_df_h5,'scientaEkinRange_eV_1','scientaEkinRange_eV_2')
props_dict['lens_mode'] = reduced_df_h5['scientaLensMode'][0]
props_dict['acq_mode'] = reduced_df_h5['scientaAcquisitionMode'][0]
props_dict['position_x'] = "{:.2f}".format(reduced_df_h5.loc[:,'smplX_mm'].mean()) # round(reduced_df_h5.loc[:,'smplX_mm'].mean(),2)
props_dict['position_y'] = "{:.2f}".format(reduced_df_h5.loc[:,'smplY_mm'].mean())
props_dict['position_z'] = "{:.2f}".format(reduced_df_h5.loc[:,'smplZ_mm'].mean())
return props_dict
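# Illustrative usage sketch (assumes df_openbis has already been paired with df_h5 via
# pair_openbis_and_h5_dataframes, so that the 'related_h5_indices' column exists):
#   props_dict = compute_openbis_sample_props_from_h5(df_openbis, df_h5, df_openbis.index[0])
#   # props_dict maps openbis property names (e.g. 'temp', 'cell_pressure', 'photon_energy')
#   # to string values aggregated over the paired h5 measurements.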
def single_sample_update(sample_props_dict,sample_collection,props_include_list):
""" Updates the openbis sample specified by sample_props_dict; the sample must belong to sample_collection (i.e., the result of openbis_obj.get_samples(...)). """
try:
sample_path_identifier = sample_props_dict['identifier'] #path-like index
sample = sample_collection[sample_path_identifier]
for prop in sample_props_dict.keys():
if (prop in admissible_props_list) and (prop in props_include_list):
sample.props[prop] = sample_props_dict[prop]
sample.save()
except Exception as e:
logging.error(e)
return 0
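# Illustrative usage sketch (assumes an authenticated openbis_obj and a props_dict computed by
# compute_openbis_sample_props_from_h5; the sample type and property list shown are hypothetical):
#   sample_collection = openbis_obj.get_samples(type='EXPERIMENTAL_STEP_PREMISE')
#   single_sample_update(props_dict, sample_collection, ['temp', 'cell_pressure'])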
def sample_batch_update(openbis_obj,sample_collection,df_openbis,df_h5,props_include_list):
""" Updates a batch of openbis samples (the rows of df_openbis) with properties computed from their paired h5 measurements in df_h5, queuing all changes in a single openbis transaction. """
if not 'related_h5_indices' in df_openbis.columns:
raise ValueError("Input dataframe 'df_openbis' must contain a column named 'related_h5_indices', resulting from suitable preprocessing steps.")
# Safeguard: exclude properties that must not be changed
exclude_list = ['filenumber','FILENUMBER','identifier']
# Build a filtered copy instead of removing items while iterating, which would skip elements
props_include_list = [item for item in props_include_list if item not in exclude_list]
trans = openbis_obj.new_transaction()
for sample_idx in df_openbis.index: # iterate over index labels, as expected by compute_openbis_sample_props_from_h5 and its .loc lookups
props_dict = compute_openbis_sample_props_from_h5(df_openbis, df_h5, sample_idx)
sample_path_identifier = props_dict['identifier'] #path-like index
sample = sample_collection[sample_path_identifier]
for prop in props_dict.keys():
if prop in props_include_list:
sample.props[prop] = props_dict[prop]
trans.add(sample)
trans.commit()
return 0
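# Illustrative usage sketch (assumes the preprocessed dataframes from conduct_dataframe_preprocessing_steps
# below and a sample_collection obtained from an openbis query; the property list is hypothetical):
#   sample_batch_update(openbis_obj, sample_collection, df_openbis, df_h5,
#                       ['temp', 'cell_pressure', 'photon_energy'])
#   # All property updates are queued on one transaction and committed together.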
def conduct_dataframe_preprocessing_steps(df_h5, df_openbis):
if not 'lastModifiedDatestr' in df_h5.columns:
raise ValueError("Input dataframe 'df_h5' must contain a column named 'lastModifiedDatestr'.")
df_h5, df_openbis = align_datetime_observation_windows(df_h5, df_openbis, 'lastModifiedDatestr' , 'registrationDate')
df_openbis = pair_openbis_and_h5_dataframes(df_openbis, df_h5, 'REFORMATED_FILENUMBER', 'name')
return df_h5, df_openbis
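# Illustrative sketch of the preprocessing chain (df_h5 and df_openbis are assumed to have been
# built beforehand with the columns checked above):
#   df_h5, df_openbis = conduct_dataframe_preprocessing_steps(df_h5, df_openbis)
#   # Internally this aligns the observation windows on 'lastModifiedDatestr'/'registrationDate'
#   # and then pairs each openbis sample with its h5 measurements via 'REFORMATED_FILENUMBER'/'name'.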

View File

@ -1,58 +1,58 @@
import scipy.optimize as sp_opt
import pandas as pd
def construct_mask(x, subinterval_list):
""" Constructs a mask of length len(x) that indicates whether the entries of x lie within the subintervals
specified in subinterval_list.
Parameters:
x (array_like): values to test against the subintervals
subinterval_list (list of two-element tuples): closed subintervals (lower, upper)
Returns:
mask (bool array_like): True where x lies in at least one subinterval
Usage:
x = np.array([0.0, 0.25, 0.5, 0.75, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
subinterval_list = [(0.25,0.75),(2.5,3.5)]
mask = construct_mask(x,subinterval_list)
"""
mask = x < x.min() # initialize an all-False mask with the same length as x
for subinterval in subinterval_list:
mask = mask | ((x >= subinterval[0]) & (x <= subinterval[1]))
return mask
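# Illustrative sketch of the mask construction above (numpy import shown for completeness):
#   import numpy as np
#   x = np.array([0.0, 0.25, 0.5, 0.75, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
#   construct_mask(x, [(0.25, 0.75), (2.5, 3.5)])
#   # -> array([False, True, True, True, False, False, True, True, True, False])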
def estimate_background(x,y,mask,method: str):
"""Fits a background model to the values of x and y selected by mask, using the requested method.
Parameters:
x,y (array_like, e.g., np.array, pd.Series): data from which the background is estimated
mask (bool array_like): selects the points used to fit the background model
method (str): background model; currently only 'linear' is supported
Returns:
y_bg (array_like): values of the fitted model evaluated at x, i.e., the background estimate
"""
if method == 'linear':
def linear_model(x,m,b):
return (m*x) + b
popt, pcov = sp_opt.curve_fit(linear_model,x[mask],y[mask])
y_bg = linear_model(x,*popt)
else:
raise ValueError("Parameter 'method' can only be set as 'linear'. Future code releases may include more options. ")
return y_bg
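# Illustrative usage sketch (synthetic data; the noise-free line is an assumption made for the example):
#   import numpy as np
#   x = np.linspace(0.0, 10.0, 50)
#   y = 0.5 * x + 1.0                                   # pretend this is a spectrum's linear background
#   mask = construct_mask(x, [(0.0, 2.0), (8.0, 10.0)]) # fit only on the flanks of the region of interest
#   y_bg = estimate_background(x, y, mask, method='linear')
#   y_corrected = y - y_bg                              # background-subtracted signal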