Synch with remote repo

2025-02-03 10:31:48 +01:00
parent a3ccff4079
commit 32bba4239a
102 changed files with 19584 additions and 19584 deletions


@@ -1,358 +1,358 @@
import subprocess
import os
import utils.g5505_utils as utils
from pipelines.metadata_revision import update_hdf5_file_with_review
def perform_git_operations(hdf5_upload):
status_command = ['git', 'status']
status = subprocess.run(status_command, capture_output=True, check=True)
if hdf5_upload:
upload_ext = ['.h5', '.yaml']
else:
upload_ext = ['.yaml']
files_to_add_list = extract_files_to_add(status.stdout, upload_ext)
if files_to_add_list:
add_files_to_git(files_to_add_list)
commit_changes('Updated hdf5 file with yaml review file.')
else:
print("There were no found h5 and yaml files, needing to be saved. This action will not have effect on the review process' commit history.")
def extract_files_to_add(git_status_output, upload_ext):
files_to_add_list = []
for line in git_status_output.splitlines():
tmp = line.decode("utf-8")
if 'modified' in tmp:
if any(ext in tmp for ext in upload_ext):
files_to_add_list.append(tmp.split()[1])
return files_to_add_list
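# Note: parsing the human-readable `git status` output above is fragile across git versions and locales.
# A minimal sketch of a more robust variant, using the machine-readable `git status --porcelain` format,
# is given below; `extract_files_to_add_porcelain` is an illustrative assumption and is not wired into the workflow.
def extract_files_to_add_porcelain(upload_ext):
    status = subprocess.run(['git', 'status', '--porcelain'], capture_output=True, text=True, check=True)
    files = []
    for line in status.stdout.splitlines():
        # Porcelain lines look like "XY <path>"; drop the two status characters and the separator.
        path = line[3:].strip()
        if any(path.endswith(ext) for ext in upload_ext):
            files.append(path)
    return files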
def add_files_to_git(files_to_add_list):
add_command = ['git', 'add'] + files_to_add_list
    return subprocess.run(add_command, capture_output=True, check=True)
def commit_changes(message):
commit_command = ['git', 'commit', '-m', message]
    commit_output = subprocess.run(commit_command, capture_output=True, check=True)
    print(commit_output.stdout)
    return commit_output
def get_status():
return subprocess.run(['git','status'],capture_output=True,text=True,check=True)
def show_current_branch():
current_branch_command = ['git','branch','--show-current']
    return subprocess.run(current_branch_command, capture_output=True, text=True, check=True)
YAML_EXT = ".yaml"
TXT_EXT = ".txt"
def get_review_status(filename_path):
filename_path_tail, filename_path_head = os.path.split(filename_path)
filename, ext = os.path.splitext(filename_path_head)
# TODO:
with open(os.path.join("review/",filename+"-review_status"+TXT_EXT),'r') as f:
workflow_steps = []
for line in f:
workflow_steps.append(line)
return workflow_steps[-1]
def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False):
"""
First: Initialize review branch with review folder with a copy of yaml representation of
hdf5 file under review and by creating a txt file with the state of the review process, e.g., under review.
"""
initials = reviewer_attrs['initials']
#branch_name = '-'.join([reviewer_attrs['type'],'review_',initials])
branch_name = '_'.join(['review',initials])
hdf5_file_path_tail, filename_path_head = os.path.split(hdf5_file_path)
filename, ext = os.path.splitext(filename_path_head)
# Check file_path points to h5 file
if not 'h5' in ext:
raise ValueError("filename_path needs to point to an h5 file.")
# Verify if yaml snapshot of input h5 file exists
if not os.path.exists(os.path.join(hdf5_file_path_tail,filename+YAML_EXT)):
raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run serialize_metadata(filename_path) ")
# Initialize metadata review workflow
# print("Create branch metadata-review-by-"+initials+"\n")
#checkout_review_branch(branch_name)
# Check you are working at the right branch
curr_branch = show_current_branch()
if not branch_name in curr_branch.stdout:
raise ValueError("Branch "+branch_name+" was not found. \nPlease open a Git Bash Terminal, and follow the below instructions: \n1. Change directory to your project's directory. \n2. Excecute the command: git checkout "+branch_name)
# Check if review file already exists and then check if it is still untracked
review_yaml_file_path = os.path.join("review/",filename+YAML_EXT)
review_yaml_file_path_tail, ext = os.path.splitext(review_yaml_file_path)
review_status_yaml_file_path = os.path.join(review_yaml_file_path_tail+"-review_status"+".txt")
if not os.path.exists(review_yaml_file_path) or restart:
review_yaml_file_path = utils.make_file_copy(os.path.join(hdf5_file_path_tail,filename+YAML_EXT), 'review')
if restart:
            print("Metadata review has been reinitialized. The review files will reflect the current state of the HDF5 file's metadata.")
#if not os.path.exists(os.path.join(review_yaml_file_path_tail+"-review_status"+".txt")):
with open(review_status_yaml_file_path,'w') as f:
f.write('under review')
# Stage untracked review files and commit them to local repository
status = get_status()
untracked_files = []
for line in status.stdout.splitlines():
#tmp = line.decode("utf-8")
#modified_files.append(tmp.split()[1])
if 'review/' in line:
            if not 'modified' in line: # untracked files
untracked_files.append(line.strip())
else:
untracked_files.append(line.strip().split()[1])
if 'output_files/'+filename+YAML_EXT in line and not 'modified' in line:
untracked_files.append(line.strip())
if untracked_files:
        add_files_to_git(untracked_files)
        message = 'Initialized metadata review.'
        commit_output = commit_changes(message)
        for line in commit_output.stdout.splitlines():
            print(line.decode('utf-8'))
#else:
# print('This action will not have any effect because metadata review process has been already initialized.')
#status_dict = repo_obj.status()
#for filepath, file_status in status_dict.items():
# Identify keys associated to review files and stage them
# if 'review/'+filename in filepath:
# Stage changes
# repo_obj.index.add(filepath)
#author = config_file.author #default_signature
#committer = config_file.committer
#message = "Initialized metadata review process."
#tree = repo_obj.index.write_tree()
#oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])
#print("Add and commit"+"\n")
return review_yaml_file_path, review_status_yaml_file_path
def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
"""
Second: Once you're done reviewing the yaml representation of hdf5 file in review folder.
Change the review status to complete and save (add and commit) modified .yalm and .txt files in the project by
running this function.
"""
    # 1. Verify review initialization was performed first
# 2. change review status in txt to complete
# 3. git add review/ and git commit -m "Submitted metadata review"
initials = reviewer_attrs['initials']
#branch_name = '-'.join([reviewer_attrs['type'],'review','by',initials])
branch_name = '_'.join(['review',initials])
# TODO: replace with subprocess + git
#checkout_review_branch(repo_obj, branch_name)
# Check you are working at the right branch
curr_branch = show_current_branch()
if not branch_name in curr_branch.stdout:
raise ValueError('Please checkout ' + branch_name + ' via Git Bash before submitting metadata review files. ')
# Collect modified review files
status = get_status()
modified_files = []
os.path.basename(review_yaml_file_path)
for line in status.stdout.splitlines():
        # get_status() runs git with text=True, so each line is already a str
        tmp = line
if 'modified' in tmp and 'review/' in tmp and os.path.basename(review_yaml_file_path) in tmp:
modified_files.append(tmp.split()[1])
# Stage modified files and commit them to local repository
review_yaml_file_path_tail, review_yaml_file_path_head = os.path.split(review_yaml_file_path)
filename, ext = os.path.splitext(review_yaml_file_path_head)
if modified_files:
review_status_file_path = os.path.join("review/",filename+"-review_status"+TXT_EXT)
with open(review_status_file_path,'a') as f:
f.write('\nsubmitted')
modified_files.append(review_status_file_path)
        add_files_to_git(modified_files)
        message = 'Submitted metadata review.'
        commit_output = commit_changes(message)
        for line in commit_output.stdout.splitlines():
            print(line.decode('utf-8'))
else:
print('Nothing to commit.')
#
def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs=None, hdf5_upload=False):
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('The review YAML file must be submitted before performing an update. Run second_save_metadata_review() first.')
update_hdf5_file_with_review(input_hdf5_file, yaml_review_file)
perform_git_operations(hdf5_upload)
def last_submit_metadata_review(reviewer_attrs):
"""Fourth: """
initials =reviewer_attrs['initials']
repository = 'origin'
branch_name = '_'.join(['review',initials])
push_command = lambda repository,refspec: ['git','push',repository,refspec]
list_branches_command = ['git','branch','--list']
branches = subprocess.run(list_branches_command,capture_output=True,text=True,check=True)
if not branch_name in branches.stdout:
print('There is no branch named '+branch_name+'.\n')
        print('Make sure to run the data-owner review workflow from the beginning without skipping any steps.')
return
curr_branch = show_current_branch()
if not branch_name in curr_branch.stdout:
        print('The metadata review could not be completed.\n')
        print('Make sure a data-owner workflow has already been started on branch '+branch_name+'.\n')
        print('The step "Complete metadata review" will have no effect.')
return
# push
result = subprocess.run(push_command(repository,branch_name),capture_output=True,text=True,check=True)
print(result.stdout)
# 1. git add output_files/
# 2. delete review/
#shutil.rmtree(os.path.join(os.path.abspath(os.curdir),"review"))
# 3. git rm review/
# 4. git commit -m "Completed review process. Current state of hdf5 file and yml should be up to date."
return result.returncode
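# The four review steps above are meant to be run in this order. A hedged usage sketch (file names and
# reviewer attributes are placeholders; calls are commented out, mirroring main() below):
#
#   reviewer = {'initials': 'NG', 'type': 'data-owner'}
#   review_yaml, review_status = first_initialize_metadata_review('output_files/experiment.h5', reviewer)
#   # ... edit review_yaml by hand, then:
#   second_save_metadata_review(review_yaml, reviewer)
#   third_update_hdf5_file_with_review('output_files/experiment.h5', review_yaml, reviewer, hdf5_upload=True)
#   last_submit_metadata_review(reviewer)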
#import config_file
#import hdf5_ops
class MetadataHarvester:
def __init__(self, parent_files=None):
if parent_files is None:
parent_files = []
self.parent_files = parent_files
self.metadata = {
"project": {},
"sample": {},
"environment": {},
"instruments": {},
"datasets": {}
}
def add_project_info(self, key_or_dict, value=None, append=False):
self._add_info("project", key_or_dict, value, append)
def add_sample_info(self, key_or_dict, value=None, append=False):
self._add_info("sample", key_or_dict, value, append)
def add_environment_info(self, key_or_dict, value=None, append=False):
self._add_info("environment", key_or_dict, value, append)
def add_instrument_info(self, key_or_dict, value=None, append=False):
self._add_info("instruments", key_or_dict, value, append)
def add_dataset_info(self, key_or_dict, value=None, append=False):
self._add_info("datasets", key_or_dict, value, append)
def _add_info(self, category, key_or_dict, value, append):
"""Internal helper method to add information to a category."""
if isinstance(key_or_dict, dict):
self.metadata[category].update(key_or_dict)
else:
if key_or_dict in self.metadata[category]:
if append:
current_value = self.metadata[category][key_or_dict]
if isinstance(current_value, list):
if not isinstance(value, list):
# Append the new value to the list
self.metadata[category][key_or_dict].append(value)
else:
self.metadata[category][key_or_dict] = current_value + value
elif isinstance(current_value, str):
# Append the new value as a comma-separated string
self.metadata[category][key_or_dict] = current_value + ',' + str(value)
else:
# Handle other types (for completeness, usually not required)
self.metadata[category][key_or_dict] = [current_value, value]
else:
self.metadata[category][key_or_dict] = value
else:
self.metadata[category][key_or_dict] = value
def get_metadata(self):
return {
"parent_files": self.parent_files,
"metadata": self.metadata
}
def print_metadata(self):
print("parent_files", self.parent_files)
for key in self.metadata.keys():
print(key,'metadata:\n')
for item in self.metadata[key].items():
print(item[0],item[1])
def clear_metadata(self):
self.metadata = {
"project": {},
"sample": {},
"environment": {},
"instruments": {},
"datasets": {}
}
self.parent_files = []
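# A short, illustrative sketch of how MetadataHarvester is meant to be used; the keys and values below
# are assumptions, not a prescribed schema.
def _example_metadata_harvester_usage():
    harvester = MetadataHarvester(parent_files=['output_files/experiment.h5'])
    harvester.add_project_info({'project_name': 'smog_chamber', 'processing_file': 'example.h5'})
    harvester.add_sample_info('sample_name', 'soot')
    # With append=True, a second value for an existing key is accumulated instead of overwritten.
    harvester.add_sample_info('sample_name', 'NaCl', append=True)
    harvester.add_dataset_info('data_table', {'units': 'a.u.'})
    return harvester.get_metadata()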
def main():
output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
output_yml_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.yalm"
output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)
#output_yml_filename_path = hdf5_ops.serialize_metadata(output_filename_path)
#first_initialize_metadata_review(output_filename_path,initials='NG')
#second_submit_metadata_review()
#if os.path.exists(os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename)):
# third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename))
#fourth_complete_metadata_review()

File diff suppressed because it is too large.


@@ -1,396 +1,396 @@
import sys
import os
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)
import pandas as pd
import numpy as np
import h5py
import logging
import utils.g5505_utils as utils
import instruments.readers.filereader_registry as filereader_registry
def __transfer_file_dict_to_hdf5(h5file, group_name, file_dict):
"""
Transfers data from a file_dict to an HDF5 file.
Parameters
----------
h5file : h5py.File
HDF5 file object where the data will be written.
group_name : str
Name of the HDF5 group where data will be stored.
file_dict : dict
Dictionary containing file data to be transferred. Required structure:
{
'name': str,
'attributes_dict': dict,
'datasets': [
{
'name': str,
'data': array-like,
'shape': tuple,
'attributes': dict (optional)
},
...
]
}
    Returns
    -------
    stdout : str or Exception
        A status message on success, or the caught exception if the transfer failed.
    """
if not file_dict:
return
try:
        # Create the group and add its attributes
filename = file_dict['name']
group = h5file[group_name].create_group(name=filename)
# Add group attributes
group.attrs.update(file_dict['attributes_dict'])
# Add datasets to the just created group
for dataset in file_dict['datasets']:
dataset_obj = group.create_dataset(
name=dataset['name'],
data=dataset['data'],
shape=dataset['shape']
)
# Add dataset's attributes
attributes = dataset.get('attributes', {})
dataset_obj.attrs.update(attributes)
group.attrs['last_update_date'] = utils.created_at().encode('utf-8')
stdout = f'Completed transfer for /{group_name}/{filename}'
except Exception as inst:
stdout = inst
logging.error('Failed to transfer data into HDF5: %s', inst)
return stdout
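# For reference, a minimal file_dict satisfying the contract documented above could look like this
# (values are illustrative only):
#
#   file_dict = {
#       'name': 'gas_flow_2024-03-19.txt',
#       'attributes_dict': {'instrument': 'MFC', 'units': 'sccm'},
#       'datasets': [{'name': 'data_table', 'data': np.zeros((10, 2)), 'shape': (10, 2),
#                     'attributes': {'columns': 'time,flow'}}]
#   }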
def __copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name, work_with_copy : bool = True):
# Create copy of original file to avoid possible file corruption and work with it.
if work_with_copy:
tmp_file_path = utils.make_file_copy(source_file_path)
else:
tmp_file_path = source_file_path
    # Open the (backup) h5 file and copy its complete group hierarchy into a group of the destination file
with h5py.File(tmp_file_path,'r') as src_file:
dest_file_obj.copy(source= src_file['/'], dest= dest_group_name)
if 'tmp_files' in tmp_file_path:
os.remove(tmp_file_path)
stdout = f'Completed transfer for /{dest_group_name}'
return stdout
def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
path_to_filenames_dict: dict = None,
select_dir_keywords : list = [],
root_metadata_dict : dict = {}, mode = 'w'):
"""
    Creates an .h5 file, named after the input directory, that preserves the directory tree (or folder structure)
    of the given filesystem path.
    The data integration capabilities are limited by our file readers, which can only access data from a list of
    admissible file formats. These, however, can be extended. Directories become groups in the resulting HDF5 file,
    and files are stored as composite objects consisting of a group, datasets, and attributes.
Parameters
----------
    path_to_input_directory : str
        Path to the root directory, specified with forward slashes, e.g., path/to/root. The output
        HDF5 file is created next to it and named after this directory.
    path_to_filenames_dict : dict, optional
        A pre-processed dictionary where keys are directory paths in the input directory's tree and values are lists of files.
        If provided, 'path_to_input_directory' is not re-scanned.
select_dir_keywords : list
List of string elements to consider or select only directory paths that contain
a word in 'select_dir_keywords'. When empty, all directory paths are considered
to be included in the HDF5 file group hierarchy.
root_metadata_dict : dict
Metadata to include at the root level of the HDF5 file.
    mode : str
        'w' creates a new file (the function refuses to overwrite an existing one); 'r+' opens an existing file for reading and writing. Defaults to 'w'.
Returns
-------
output_filename : str
Path to the created HDF5 file.
"""
    if mode not in ['w','r+']:
        raise ValueError('Parameter mode must take values in ["w","r+"].')
if not '/' in path_to_input_directory:
raise ValueError('path_to_input_directory needs to be specified using forward slashes "/".' )
#path_to_output_directory = os.path.join(path_to_input_directory,'..')
path_to_input_directory = os.path.normpath(path_to_input_directory).rstrip(os.sep)
for i, keyword in enumerate(select_dir_keywords):
select_dir_keywords[i] = keyword.replace('/',os.sep)
if not path_to_filenames_dict:
        # With dry_run=True, this returns the path-to-files dictionary of the output directory without making an actual copy of the input directory.
        # Therefore, there won't be a copying conflict even though the input and output directories are the same.
path_to_filenames_dict = utils.copy_directory_with_contraints(input_dir_path=path_to_input_directory,
output_dir_path=path_to_input_directory,
dry_run=True)
# Set input_directory as copied input directory
root_dir = path_to_input_directory
path_to_output_file = path_to_input_directory.rstrip(os.path.sep) + '.h5'
start_message = f'\n[Start] Data integration :\nSource: {path_to_input_directory}\nDestination: {path_to_output_file}\n'
print(start_message)
logging.info(start_message)
# Check if the .h5 file already exists
if os.path.exists(path_to_output_file) and mode in ['w']:
message = (
f"[Notice] The file '{path_to_output_file}' already exists and will not be overwritten.\n"
"If you wish to replace it, please delete the existing file first and rerun the program."
)
print(message)
logging.error(message)
else:
with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
number_of_dirs = len(path_to_filenames_dict.keys())
dir_number = 1
for dirpath, filtered_filenames_list in path_to_filenames_dict.items():
# Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
if not filtered_filenames_list:
continue
group_name = dirpath.replace(os.sep,'/')
group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/')
# Flatten group name to one level
if select_dir_keywords:
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
else:
offset = 1
tmp_list = group_name.split('/')
if len(tmp_list) > offset+1:
group_name = '/'.join([tmp_list[i] for i in range(offset+1)])
# Create group called "group_name". Hierarchy of nested groups can be implicitly defined by the forward slashes
if not group_name in h5file.keys():
h5file.create_group(group_name)
h5file[group_name].attrs['creation_date'] = utils.created_at().encode('utf-8')
#h5file[group_name].attrs.create(name='filtered_file_list',data=convert_string_to_bytes(filtered_filename_list))
#h5file[group_name].attrs.create(name='file_list',data=convert_string_to_bytes(filenames_list))
#else:
#print(group_name,' was already created.')
instFoldermsgStart = f'Starting data transfer from instFolder: {group_name}'
print(instFoldermsgStart)
for filenumber, filename in enumerate(filtered_filenames_list):
#file_ext = os.path.splitext(filename)[1]
#try:
# hdf5 path to filename group
dest_group_name = f'{group_name}/{filename}'
if not 'h5' in filename:
#file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename))
#file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath,filename))
stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict)
else:
source_file_path = os.path.join(dirpath,filename)
dest_file_obj = h5file
#group_name +'/'+filename
#ext_to_reader_dict[file_ext](source_file_path, dest_file_obj, dest_group_name)
#g5505f_reader.select_file_reader(dest_group_name)(source_file_path, dest_file_obj, dest_group_name)
stdout = __copy_file_in_group(source_file_path, dest_file_obj, dest_group_name, False)
                # Update the progress bar and print/log the end-of-folder message
                instFolderMsgEnd = f'\nCompleted data transfer for instFolder: {group_name}\n'
                utils.progressBar(dir_number, number_of_dirs, instFolderMsgEnd)
                logging.info(instFolderMsgEnd)
dir_number = dir_number + 1
print('[End] Data integration')
logging.info('[End] Data integration')
if len(root_metadata_dict.keys())>0:
for key, value in root_metadata_dict.items():
#if key in h5file.attrs:
# del h5file.attrs[key]
h5file.attrs.create(key, value)
#annotate_root_dir(output_filename,root_metadata_dict)
#output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename)
return path_to_output_file #, output_yml_filename_path
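# A hedged usage sketch of create_hdf5_file_from_filesystem_path; the directory, keywords, and metadata
# below are placeholders:
#
#   out_h5 = create_hdf5_file_from_filesystem_path(
#       'data/beamtime_2024',
#       select_dir_keywords=['smog_chamber', 'gas'],
#       root_metadata_dict={'project': 'smog_chamber', 'contact': 'NG'},
#       mode='w')
#   # out_h5 -> 'data/beamtime_2024.h5', mirroring the selected folder structure as HDF5 groups.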
def create_hdf5_file_from_dataframe(ofilename, input_data, group_by_funcs: list, approach: str = None, extract_attrs_func=None):
"""
Creates an HDF5 file with hierarchical groups based on the specified grouping functions or columns.
Parameters:
-----------
ofilename (str): Path for the output HDF5 file.
input_data (pd.DataFrame or str): Input data as a DataFrame or a valid file system path.
group_by_funcs (list): List of callables or column names to define hierarchical grouping.
approach (str): Specifies the approach ('top-down' or 'bottom-up') for creating the HDF5 file.
extract_attrs_func (callable, optional): Function to extract additional attributes for HDF5 groups.
Returns:
--------
    ofilename (str): Path of the created HDF5 file.
"""
# Check whether input_data is a valid file-system path or a DataFrame
is_valid_path = lambda x: os.path.exists(x) if isinstance(x, str) else False
if is_valid_path(input_data):
# If input_data is a file-system path, create a DataFrame with file info
file_list = os.listdir(input_data)
df = pd.DataFrame(file_list, columns=['filename'])
df = utils.augment_with_filetype(df) # Add filetype information if needed
elif isinstance(input_data, pd.DataFrame):
# If input_data is a DataFrame, make a copy
df = input_data.copy()
else:
raise ValueError("input_data must be either a valid file-system path or a DataFrame.")
# Generate grouping columns based on group_by_funcs
if utils.is_callable_list(group_by_funcs):
grouping_cols = []
for i, func in enumerate(group_by_funcs):
col_name = f'level_{i}_groups'
grouping_cols.append(col_name)
df[col_name] = func(df)
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs
else:
raise ValueError("'group_by_funcs' must be a list of callables or valid column names in the DataFrame.")
# Generate group paths
df['group_path'] = ['/' + '/'.join(row) for row in df[grouping_cols].values.astype(str)]
# Open the HDF5 file in write mode
with h5py.File(ofilename, 'w') as file:
for group_path in df['group_path'].unique():
# Create groups in HDF5
group = file.create_group(group_path)
# Filter the DataFrame for the current group
datatable = df[df['group_path'] == group_path].copy()
# Drop grouping columns and the generated 'group_path'
datatable = datatable.drop(columns=grouping_cols + ['group_path'])
# Add datasets to groups if data exists
if not datatable.empty:
dataset = utils.convert_dataframe_to_np_structured_array(datatable)
group.create_dataset(name='data_table', data=dataset)
# Add attributes if extract_attrs_func is provided
if extract_attrs_func:
attrs = extract_attrs_func(datatable)
for key, value in attrs.items():
group.attrs[key] = value
# Save metadata about depth of hierarchy
file.attrs.create(name='depth', data=len(grouping_cols) - 1)
print(f"HDF5 file created successfully at {ofilename}")
return ofilename
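# A minimal sketch of the DataFrame-based variant, grouping a file table by two existing columns
# (the column names and rows are assumptions):
#
#   df = pd.DataFrame({'filename': ['a.txt', 'b.txt'],
#                      'instrument': ['gas', 'gas'],
#                      'date': ['2024-03-19', '2024-03-20']})
#   create_hdf5_file_from_dataframe('output_files/example.h5', df,
#                                   group_by_funcs=['instrument', 'date'])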
def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name):
"""
Save processed dataframe columns with annotations to an HDF5 file.
Parameters:
df (pd.DataFrame): DataFrame containing processed time series.
    annotator: Object exposing a get_metadata() method (e.g., a MetadataHarvester).
    output_filename (str): Path to the destination HDF5 file.
"""
# Convert datetime columns to string
datetime_cols = df.select_dtypes(include=['datetime64']).columns
if list(datetime_cols):
df[datetime_cols] = df[datetime_cols].map(str)
# Convert dataframe to structured array
icad_data_table = utils.convert_dataframe_to_np_structured_array(df)
# Get metadata
metadata_dict = annotator.get_metadata()
# Prepare project level attributes to be added at the root level
project_level_attributes = metadata_dict['metadata']['project']
# Prepare high-level attributes
high_level_attributes = {
'parent_files': metadata_dict['parent_files'],
**metadata_dict['metadata']['sample'],
**metadata_dict['metadata']['environment'],
**metadata_dict['metadata']['instruments']
}
# Prepare data level attributes
data_level_attributes = metadata_dict['metadata']['datasets']
for key, value in data_level_attributes.items():
if isinstance(value,dict):
data_level_attributes[key] = utils.convert_attrdict_to_np_structured_array(value)
# Prepare file dictionary
file_dict = {
'name': project_level_attributes['processing_file'],
'attributes_dict': high_level_attributes,
'datasets': [{
'name': "data_table",
'data': icad_data_table,
'shape': icad_data_table.shape,
'attributes': data_level_attributes
}]
}
# Check if the file exists
if os.path.exists(output_filename):
mode = "a"
print(f"File {output_filename} exists. Opening in append mode.")
else:
mode = "w"
print(f"File {output_filename} does not exist. Creating a new file.")
# Write to HDF5
with h5py.File(output_filename, mode) as h5file:
# Add project level attributes at the root/top level
h5file.attrs.update(project_level_attributes)
__transfer_file_dict_to_hdf5(h5file, '/', file_dict)
#if __name__ == '__main__':
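# A hedged end-to-end sketch of save_processed_dataframe_to_hdf5. The annotator is assumed to be a
# MetadataHarvester (from the metadata-review module in this commit); names and values are illustrative:
#
#   harvester = MetadataHarvester(parent_files=['output_files/raw.h5'])
#   harvester.add_project_info({'project': 'smog_chamber', 'processing_file': 'processed.h5'})
#   harvester.add_dataset_info('data_table', {'columns': 'time,O3_ppb'})
#   processed_df = pd.DataFrame({'time': pd.date_range('2024-03-19', periods=3, freq='h'),
#                                'O3_ppb': [31.2, 30.8, 29.9]})
#   save_processed_dataframe_to_hdf5(processed_df, harvester, 'output_files/processed.h5')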


@@ -1,270 +1,270 @@
import pandas as pd
import logging
import os
import datetime
from pybis import Openbis
import hidden
admissible_props_list = ['$name', 'filenumber', 'default_experiment.experimental_results',
'dataquality', '$xmlcomments', '$annotations_state',
'sample_name', 'position_x', 'position_y', 'position_z', 'temp', 'cell_pressure', 'gas_flow_setting', 'sample_notes',
'beamline', 'photon_energy', 'slit_entrance_v', 'slit_exit_v', 'izero',
'slit_exit_h', 'hos', 'cone', 'endstation', 'hof',
'method_name', 'region', 'lens_mode', 'acq_mode', 'dwell_time', 'frames', 'passenergy',
'iterations', 'sequenceiterations', 'ke_range_center', 'ke_step']
def initialize_openbis_obj():
# TODO: implement a more secure authentication method.
openbis_obj = Openbis('https://openbis-psi.labnotebook.ch/openbis/webapp/eln-lims/?menuUniqueId=null&viewName=showBlancPage&viewData=null', verify_certificates=False)
openbis_obj.login(hidden.username,hidden.password)
return openbis_obj
def align_datetime_observation_windows(df_h5: pd.DataFrame, df_openbis: pd.DataFrame, h5_datetime_var: str = 'lastModifiedDatestr', ob_datetime_var: str = 'registrationDate') -> pd.DataFrame:
""" returns filtered/reduced versions of 'df' and 'df_ref' with aligned datetime observation windows.
That is, the datetime variable range is the same for the returned dataframes."""
#""returns a filtered or reduced version of 'df' by removing all rows that are outside the datetime variable overlapping region between 'df' and 'df_ref'.
#"""
#df_h5['lastModifiedDatestr'] = df_h5['lastModifiedDatestr'].astype('datetime64[ns]')
#df_h5 = df_h5.sort_values(by='lastModifiedDatestr')
if not (h5_datetime_var in df_h5.columns.to_list() and ob_datetime_var in df_openbis.columns.to_list()):
#TODO: Check if ValueError is the best type of error to raise here
raise ValueError("Dataframes 'df' and 'df_ref' must contain columns 'datetime_var' and 'datetime_var_ref', storing values in suitable datetime string format (e.g., yyyy-mm-dd hh:mm:ss).")
df_h5[h5_datetime_var] = df_h5[h5_datetime_var].astype('datetime64[ns]')
df_openbis[ob_datetime_var] = df_openbis[ob_datetime_var].astype('datetime64[ns]')
min_timestamp = max([df_openbis[ob_datetime_var].min(), df_h5[h5_datetime_var].min()])
max_timestamp = min([df_openbis[ob_datetime_var].max(), df_h5[h5_datetime_var].max()])
    # Determine the overlap between df_h5 and df_openbis, and filter out all rows with datetime values outside the overlapping region.
datetime_overlap_indicator = (df_h5[h5_datetime_var] >= min_timestamp) & (df_h5[h5_datetime_var] <= max_timestamp)
df_h5 = df_h5.loc[datetime_overlap_indicator,:]
datetime_overlap_indicator = (df_openbis[ob_datetime_var] >= min_timestamp) & (df_openbis[ob_datetime_var] <= max_timestamp)
df_openbis = df_openbis.loc[datetime_overlap_indicator,:]
df_h5 = df_h5.sort_values(by=h5_datetime_var)
df_openbis = df_openbis.sort_values(by=ob_datetime_var)
return df_h5, df_openbis
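# A hedged usage sketch (column names follow the function defaults; df_h5 and df_openbis are placeholders):
#
#   df_h5_aligned, df_openbis_aligned = align_datetime_observation_windows(
#       df_h5, df_openbis,
#       h5_datetime_var='lastModifiedDatestr',
#       ob_datetime_var='registrationDate')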
def reformat_openbis_dataframe_filenumber(df_openbis):
if not 'FILENUMBER' in df_openbis.columns:
        raise ValueError('df_openbis does not contain the column "FILENUMBER". Make sure you query it (e.g., o.get_samples(props=["filenumber"])) before creating df_openbis.')
#if not 'name' in df.columns:
# raise ValueError("df does not contain the column 'name'. Ensure df complies with Throsten's Table's format.")
    # Augment df_openbis with a 'name' column consistent with Thorsten's naming convention
name_list = ['0' + item.zfill(3) + item.zfill(3) for item in df_openbis['FILENUMBER']]
df_openbis['REFORMATED_FILENUMBER'] = pd.Series(name_list, index=df_openbis.index)
return df_openbis
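# For reference, the reformatting above maps a FILENUMBER string to '0' + zfill(3) + zfill(3),
# e.g. '23' -> '0023023' and '156' -> '0156156'. An illustrative check (values are made up):
#
#   df_demo = pd.DataFrame({'FILENUMBER': ['23', '156']})
#   reformat_openbis_dataframe_filenumber(df_demo)['REFORMATED_FILENUMBER'].tolist()
#   # -> ['0023023', '0156156']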
def pair_openbis_and_h5_dataframes(df_openbis, df_h5, pairing_ob_var: str, pairing_h5_var: str):
""" Pairs every row (or openbis sample) in 'df_openbis' with a set of rows (or measurements) in 'df_h5' by matching the i-th row in 'df_h5'
with the rows of 'df_h5' that satisfy the string df_openbis.loc[i,pairing_var_1] is contained in the string df_h5[i,pairing_var_2]
Example: pairing_var_1, pairing_var_2 = reformated 'REFORMATED_FILENUMBER', 'name'
"""
# Reformat openbis dataframe filenumber so that it can be used to find associated measurements in h5 dataframe
df_openbis = reformat_openbis_dataframe_filenumber(df_openbis)
related_indices_list = []
for sample_idx in df_openbis.index:
sample_value = df_openbis.loc[sample_idx,pairing_ob_var]
tmp_list = [sample_value in item[0:item.find('_')] for item in df_h5[pairing_h5_var]]
related_indices_list.append(df_h5.index[tmp_list])
        print('Pairing openbis sample: ' + df_openbis.loc[sample_idx,pairing_ob_var])
        print('with reformatted FILENUMBER: ' + sample_value)
print('to following measurements in h5 dataframe:')
print(df_h5.loc[df_h5.index[tmp_list],'name'])
print('\n')
df_openbis['related_h5_indices'] = pd.Series(related_indices_list, index=df_openbis.index)
return df_openbis
def range_cols_2_string(df,lb_var,ub_var):
if not sum(df.loc[:,ub_var]-df.loc[:,lb_var])==0:
#tmp_list = ['-'.join([str(round(df.loc[i,lb_var],2)),str(round(df.loc[i,ub_var],1))]) for i in df.index]
tmp_list = ['-'.join(["{:.1f}".format(df.loc[i,lb_var]),"{:.1f}".format(df.loc[i,ub_var])]) for i in df.index]
elif len(df.loc[:,lb_var].unique())>1: # check if values are different
#tmp_list = [str(round(df.loc[i,lb_var],2)) for i in df.index]
tmp_list = ["{:.1f}".format(df.loc[i,lb_var]) for i in df.index]
else:
#tmp_list = [str(round(df.loc[0,lb_var],2))]
tmp_list = ["{:.1f}".format(df[lb_var].iloc[0])] # use positional access in case the index does not contain the label 0
return '/'.join(tmp_list)
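# Illustrative sketch (hypothetical lower/upper-bound columns):
#   df = pd.DataFrame({'xRayEkinRange_eV_1': [270.0, 380.0], 'xRayEkinRange_eV_2': [290.0, 400.0]})
#   range_cols_2_string(df, 'xRayEkinRange_eV_1', 'xRayEkinRange_eV_2')  # -> '270.0-290.0/380.0-400.0'
#   # If the two columns coincide row-wise, only the formatted lower bounds are returned,
#   # collapsed to a single value when they are all equal.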
def col_2_string(df,column_var):
if not column_var in df.columns:
raise ValueError("Parameter 'column_var' must be one of the columns of df.")
#tmp_list = [str(round(item,1)) for item in df[column_var]]
tmp_list = ["{:.2f}".format(item) for item in df[column_var]]
if len(df[column_var].unique())==1:
tmp_list = [tmp_list[0]]
return '/'.join(tmp_list)
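# Illustrative sketch (hypothetical column values):
#   col_2_string(pd.DataFrame({'scientaPassEnergy_eV': [50.0, 100.0]}), 'scientaPassEnergy_eV')  # -> '50.00/100.00'
#   col_2_string(pd.DataFrame({'scientaPassEnergy_eV': [50.0, 50.0]}), 'scientaPassEnergy_eV')   # -> '50.00'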
def compute_openbis_sample_props_from_h5(df_openbis, df_h5, sample_idx):
prop2attr = {'sample_name':'sample', # ask Thorsten whether this assignment is correct or not
'position_x':'smplX_mm',
'position_y':'smplY_mm',
'position_z':'smplZ_mm',
'temp':'sampleTemp_dC',
'cell_pressure':'cellPressure_mbar',
#'gas_flow_setting': '',
'method_name':'regionName', # measurement type: XPS or NEXAFS
'region':'regionName', # VB/N1s/C1s
'passenergy':'regionName', # REAL
'photon_energy':'xRayEkinRange_eV',
'dwell_time':'scientaDwellTime_ms',
'acq_mode':'scientaAcquisitionMode',
'ke_range_center':'scientaEkinRange_eV',
'ke_step':'scientaEkinStep_eV',
'lens_mode':'scientaLensMode'
}
sample_identifier = df_openbis.loc[sample_idx,'identifier']
props_dict = {'FILENUMBER' : df_openbis.loc[sample_idx,'FILENUMBER']}
#props_dict = {}
if not len(df_openbis.loc[sample_idx,'related_h5_indices']):
props_dict['identifier'] = sample_identifier
return props_dict
reduced_df_h5 = df_h5.loc[df_openbis.loc[sample_idx,'related_h5_indices'],:]
reduced_df_h5 = reduced_df_h5.reset_index()
# include related_samples key for validation purposes. Related samples are used to compute average and/or combined openbis properties.
related_sample_list = [reduced_df_h5['name'][index] for index in reduced_df_h5['name'].index]
related_samples = ' / '.join(related_sample_list)
props_dict['Subject_samples'] = related_samples
props_dict['sample_name'] = reduced_df_h5['sample'].unique()[0] if len(reduced_df_h5['sample'].unique())==1 else '/'.join(reduced_df_h5['sample'].tolist())
if not 'NEXAFS' in reduced_df_h5['regionName'].iloc[0]:
props_dict['identifier'] = sample_identifier
props_dict['method_name'] = 'XPS'
for item_idx in reduced_df_h5.index:
item = reduced_df_h5.loc[item_idx,'regionName']
if item_idx > 0:
props_dict['region'] = props_dict['region'] + '/' + item[0:item.find('_')]
#props_dict['dwell_time'] = props_dict['dwell_time'] + '/' + str(reduced_df_h5.loc[item_idx,'scientaDwellTime_ms'])
#props_dict['ke_range_center'] = props_dict['ke_range_center'] + '/' + str(round(reduced_df_h5.loc[item_idx,['scientaEkinRange_eV_1','scientaEkinRange_eV_2']].mean(),2))
#props_dict['ke_step_center'] = props_dict['ke_step_center'] + '/' + str(reduced_df_h5.loc[item_idx,'scientaEkinStep_eV'])
#props_dict['passenergy'].append(float(item[item.find('_')+1:item.find('eV')]))
else:
props_dict['region'] = item[0:item.find('_')]
#props_dict['dwell_time'] = str(reduced_df_h5.loc[item_idx,'scientaDwellTime_ms'])
#props_dict['ke_range_center'] = str(round(reduced_df_h5.loc[item_idx,['scientaEkinRange_eV_1','scientaEkinRange_eV_2']].mean(),2))
#props_dict['ke_step_center'] = str(reduced_df_h5.loc[item_idx,'scientaEkinStep_eV'])
#props_dict['passenergy'] = reduced_df_h5.loc[:,'scientaPassEnergy_eV'].min()
else:
props_dict = {'identifier':sample_identifier,'method_name':'NEXAFS'}
#props_dict['temp'] = round(reduced_df_h5['sampleTemp_dC'].mean(),2)
#props_dict['cell_pressure'] = round(reduced_df_h5['cellPressure_mbar'].mean(),2)
props_dict['temp'] = "{:.2f}".format(reduced_df_h5['sampleTemp_dC'].mean())
props_dict['cell_pressure'] = "{:.2f}".format(reduced_df_h5['cellPressure_mbar'].mean())
reduced_df_h5['scientaDwellTime_ms'] = reduced_df_h5['scientaDwellTime_ms']*1e-3 # convert ms to seconds
props_dict['dwell_time'] = col_2_string(reduced_df_h5,'scientaDwellTime_ms')
props_dict['passenergy'] = col_2_string(reduced_df_h5,'scientaPassEnergy_eV')
props_dict['ke_step_center'] = col_2_string(reduced_df_h5,'scientaEkinStep_eV')
#props_dict['photon_energy'] =round(reduced_df_h5[['xRayEkinRange_eV_1','xRayEkinRange_eV_2']].mean(axis=1)[0],2)
props_dict['photon_energy'] = range_cols_2_string(reduced_df_h5,'xRayEkinRange_eV_1','xRayEkinRange_eV_2')
props_dict['ke_range_center'] = range_cols_2_string(reduced_df_h5,'scientaEkinRange_eV_1','scientaEkinRange_eV_2')
props_dict['lens_mode'] = reduced_df_h5['scientaLensMode'][0]
props_dict['acq_mode'] = reduced_df_h5['scientaAcquisitionMode'][0]
props_dict['position_x'] = "{:.2f}".format(reduced_df_h5.loc[:,'smplX_mm'].mean()) # round(reduced_df_h5.loc[:,'smplX_mm'].mean(),2)
props_dict['position_y'] = "{:.2f}".format(reduced_df_h5.loc[:,'smplY_mm'].mean())
props_dict['position_z'] = "{:.2f}".format(reduced_df_h5.loc[:,'smplZ_mm'].mean())
return props_dict
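# Illustrative usage sketch (assumes df_openbis has already been paired with df_h5 via
# pair_openbis_and_h5_dataframes, so that the 'related_h5_indices' column exists):
#   props_dict = compute_openbis_sample_props_from_h5(df_openbis, df_h5, df_openbis.index[0])
#   # props_dict maps openbis property names (e.g. 'temp', 'cell_pressure', 'photon_energy')
#   # to string values aggregated over the paired h5 measurements.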
def single_sample_update(sample_props_dict,sample_collection,props_include_list):
""" Updates the openbis sample specified by sample_props_dict; the sample must belong to sample_collection (i.e., the result of openbis_obj.get_samples(...)). """
try:
sample_path_identifier = sample_props_dict['identifier'] #path-like index
sample = sample_collection[sample_path_identifier]
for prop in sample_props_dict.keys():
if (prop in admissible_props_list) and (prop in props_include_list):
sample.props[prop] = sample_props_dict[prop]
sample.save()
except Exception as e:
logging.error(e)
return 0
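# Illustrative usage sketch (assumes an authenticated openbis_obj and a props_dict computed by
# compute_openbis_sample_props_from_h5; the sample type and property list shown are hypothetical):
#   sample_collection = openbis_obj.get_samples(type='EXPERIMENTAL_STEP_PREMISE')
#   single_sample_update(props_dict, sample_collection, ['temp', 'cell_pressure'])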
def sample_batch_update(openbis_obj,sample_collection,df_openbis,df_h5,props_include_list):
""" Updates a batch of openbis samples (the rows of df_openbis) with properties computed from their paired h5 measurements in df_h5, queuing all changes in a single openbis transaction. """
if not 'related_h5_indices' in df_openbis.columns:
raise ValueError("Input dataframe 'df_openbis' must contain a column named 'related_h5_indices', resulting from suitable preprocessing steps.")
# Safeguard: exclude properties that must not be changed
exclude_list = ['filenumber','FILENUMBER','identifier']
# Build a filtered copy instead of removing items while iterating, which would skip elements
props_include_list = [item for item in props_include_list if item not in exclude_list]
trans = openbis_obj.new_transaction()
for sample_idx in df_openbis.index: # iterate over index labels, as expected by compute_openbis_sample_props_from_h5 and its .loc lookups
props_dict = compute_openbis_sample_props_from_h5(df_openbis, df_h5, sample_idx)
sample_path_identifier = props_dict['identifier'] #path-like index
sample = sample_collection[sample_path_identifier]
for prop in props_dict.keys():
if prop in props_include_list:
sample.props[prop] = props_dict[prop]
trans.add(sample)
trans.commit()
return 0
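# Illustrative usage sketch (assumes the preprocessed dataframes from conduct_dataframe_preprocessing_steps
# below and a sample_collection obtained from an openbis query; the property list is hypothetical):
#   sample_batch_update(openbis_obj, sample_collection, df_openbis, df_h5,
#                       ['temp', 'cell_pressure', 'photon_energy'])
#   # All property updates are queued on one transaction and committed together.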
def conduct_dataframe_preprocessing_steps(df_h5, df_openbis):
if not 'lastModifiedDatestr' in df_h5.columns:
raise ValueError("Input dataframe 'df_h5' must contain a column named 'lastModifiedDatestr'.")
df_h5, df_openbis = align_datetime_observation_windows(df_h5, df_openbis, 'lastModifiedDatestr' , 'registrationDate')
df_openbis = pair_openbis_and_h5_dataframes(df_openbis, df_h5, 'REFORMATED_FILENUMBER', 'name')
return df_h5, df_openbis
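# Illustrative sketch of the preprocessing chain (df_h5 and df_openbis are assumed to have been
# built beforehand with the columns checked above):
#   df_h5, df_openbis = conduct_dataframe_preprocessing_steps(df_h5, df_openbis)
#   # Internally this aligns the observation windows on 'lastModifiedDatestr'/'registrationDate'
#   # and then pairs each openbis sample with its h5 measurements via 'REFORMATED_FILENUMBER'/'name'.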

View File

@ -1,58 +1,58 @@
import scipy.optimize as sp_opt
import pandas as pd
def construct_mask(x, subinterval_list):
""" Constructs a mask of length len(x) that indicates whether the entries of x lie within the subintervals
specified in subinterval_list.
Parameters:
x (array_like): values to test against the subintervals
subinterval_list (list of two-element tuples): closed subintervals (lower, upper)
Returns:
mask (bool array_like): True where x lies in at least one subinterval
Usage:
x = np.array([0.0, 0.25, 0.5, 0.75, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
subinterval_list = [(0.25,0.75),(2.5,3.5)]
mask = construct_mask(x,subinterval_list)
"""
mask = x < x.min() # initialize an all-False mask with the same length as x
for subinterval in subinterval_list:
mask = mask | ((x >= subinterval[0]) & (x <= subinterval[1]))
return mask
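# Illustrative sketch of the mask construction above (numpy import shown for completeness):
#   import numpy as np
#   x = np.array([0.0, 0.25, 0.5, 0.75, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
#   construct_mask(x, [(0.25, 0.75), (2.5, 3.5)])
#   # -> array([False, True, True, True, False, False, True, True, True, False])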
def estimate_background(x,y,mask,method: str):
"""Fits a background model to the values of x and y selected by mask, using the requested method.
Parameters:
x,y (array_like, e.g., np.array, pd.Series): data from which the background is estimated
mask (bool array_like): selects the points used to fit the background model
method (str): background model; currently only 'linear' is supported
Returns:
y_bg (array_like): values of the fitted model evaluated at x, i.e., the background estimate
"""
if method == 'linear':
def linear_model(x,m,b):
return (m*x) + b
popt, pcov = sp_opt.curve_fit(linear_model,x[mask],y[mask])
y_bg = linear_model(x,*popt)
else:
raise ValueError("Parameter 'method' can only be set as 'linear'. Future code releases may include more options. ")
return y_bg
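# Illustrative usage sketch (synthetic data; the noise-free line is an assumption made for the example):
#   import numpy as np
#   x = np.linspace(0.0, 10.0, 50)
#   y = 0.5 * x + 1.0                                   # pretend this is a spectrum's linear background
#   mask = construct_mask(x, [(0.0, 2.0), (8.0, 10.0)]) # fit only on the flanks of the region of interest
#   y_bg = estimate_background(x, y, mask, method='linear')
#   y_corrected = y - y_bg                              # background-subtracted signal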