Synch with remote repo
This commit is contained in:
src/git_ops.py (716 lines)

@@ -1,358 +1,358 @@
import subprocess
import os

import utils.g5505_utils as utils
from pipelines.metadata_revision import update_hdf5_file_with_review

def perform_git_operations(hdf5_upload):
    status_command = ['git', 'status']
    status = subprocess.run(status_command, capture_output=True, check=True)

    if hdf5_upload:
        upload_ext = ['.h5', '.yaml']
    else:
        upload_ext = ['.yaml']

    files_to_add_list = extract_files_to_add(status.stdout, upload_ext)
    if files_to_add_list:
        add_files_to_git(files_to_add_list)
        commit_changes('Updated hdf5 file with yaml review file.')
    else:
        print("No modified .h5 or .yaml files were found to commit. This action will not affect the review process' commit history.")

def extract_files_to_add(git_status_output, upload_ext):
    files_to_add_list = []
    for line in git_status_output.splitlines():
        tmp = line.decode("utf-8")
        if 'modified' in tmp:
            if any(ext in tmp for ext in upload_ext):
                files_to_add_list.append(tmp.split()[1])
    return files_to_add_list

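# Illustrative note (not part of the module's API): extract_files_to_add() assumes the
# default long-format `git status` output, where a change is reported as, e.g.,
#
#     modified:   output_files/experiment.yaml
#
# so that tmp.split()[1] yields the file path following the 'modified:' tag.
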
def add_files_to_git(files_to_add_list):
    add_command = ['git', 'add'] + files_to_add_list
    subprocess.run(add_command, capture_output=True, check=True)

def commit_changes(message):
    commit_command = ['git', 'commit', '-m', message]
    commit_output = subprocess.run(commit_command, capture_output=True, check=True)
    print(commit_output.stdout)
    return commit_output

def get_status():
    return subprocess.run(['git', 'status'], capture_output=True, text=True, check=True)

def show_current_branch():
    current_branch_command = ['git', 'branch', '--show-current']
    return subprocess.run(current_branch_command, capture_output=True, text=True, check=True)


YAML_EXT = ".yaml"
TXT_EXT = ".txt"


def get_review_status(filename_path):

    filename_path_tail, filename_path_head = os.path.split(filename_path)
    filename, ext = os.path.splitext(filename_path_head)
    # TODO:
    with open(os.path.join("review/", filename + "-review_status" + TXT_EXT), 'r') as f:
        workflow_steps = []
        for line in f:
            workflow_steps.append(line)
    return workflow_steps[-1]

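# Illustrative note (not part of the module's API): the review status file written by the
# workflow functions below, review/<filename>-review_status.txt, is a plain text log whose
# lines record the workflow state, e.g.
#
#     under review
#     submitted
#
# get_review_status() returns its last line, which third_update_hdf5_file_with_review()
# later checks for the word 'submitted'.
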
def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart=False):

    """
    First: Initialize the review branch by creating a 'review' folder with a copy of the yaml
    representation of the hdf5 file under review, and a .txt file recording the state of the
    review process (e.g., 'under review').
    """

    initials = reviewer_attrs['initials']
    #branch_name = '-'.join([reviewer_attrs['type'],'review_',initials])
    branch_name = '_'.join(['review', initials])

    hdf5_file_path_tail, filename_path_head = os.path.split(hdf5_file_path)
    filename, ext = os.path.splitext(filename_path_head)

    # Check file_path points to an h5 file
    if 'h5' not in ext:
        raise ValueError("filename_path needs to point to an h5 file.")

    # Verify that a yaml snapshot of the input h5 file exists
    if not os.path.exists(os.path.join(hdf5_file_path_tail, filename + YAML_EXT)):
        raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run serialize_metadata(filename_path).")

    # Initialize metadata review workflow
    # print("Create branch metadata-review-by-"+initials+"\n")

    #checkout_review_branch(branch_name)

    # Check you are working on the right branch
    curr_branch = show_current_branch()
    if branch_name not in curr_branch.stdout:
        raise ValueError("Branch " + branch_name + " was not found. \nPlease open a Git Bash terminal and follow the instructions below: \n1. Change directory to your project's directory. \n2. Execute the command: git checkout " + branch_name)

    # Check if the review file already exists and then check if it is still untracked
    review_yaml_file_path = os.path.join("review/", filename + YAML_EXT)
    review_yaml_file_path_tail, ext = os.path.splitext(review_yaml_file_path)
    review_status_yaml_file_path = os.path.join(review_yaml_file_path_tail + "-review_status" + ".txt")

    if not os.path.exists(review_yaml_file_path) or restart:
        review_yaml_file_path = utils.make_file_copy(os.path.join(hdf5_file_path_tail, filename + YAML_EXT), 'review')
        if restart:
            print("The metadata review has been reinitialized. The review files now reflect the current state of the hdf5 file's metadata.")

    #if not os.path.exists(os.path.join(review_yaml_file_path_tail+"-review_status"+".txt")):

    with open(review_status_yaml_file_path, 'w') as f:
        f.write('under review')

    # Stage untracked review files and commit them to the local repository
    status = get_status()
    untracked_files = []
    for line in status.stdout.splitlines():
        #tmp = line.decode("utf-8")
        #modified_files.append(tmp.split()[1])
        if 'review/' in line:
            if 'modified' not in line:  # untracked files
                untracked_files.append(line.strip())
            else:
                untracked_files.append(line.strip().split()[1])

        if 'output_files/' + filename + YAML_EXT in line and 'modified' not in line:
            untracked_files.append(line.strip())

    if untracked_files:
        add_files_to_git(untracked_files)
        message = 'Initialized metadata review.'
        commit_output = commit_changes(message)

        for line in commit_output.stdout.splitlines():
            print(line.decode('utf-8'))
    #else:
    #    print('This action will not have any effect because metadata review process has been already initialized.')

    #status_dict = repo_obj.status()
    #for filepath, file_status in status_dict.items():
    #    # Identify keys associated to review files and stage them
    #    if 'review/'+filename in filepath:
    #        # Stage changes
    #        repo_obj.index.add(filepath)

    #author = config_file.author #default_signature
    #committer = config_file.committer
    #message = "Initialized metadata review process."
    #tree = repo_obj.index.write_tree()
    #oid = repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().oid])

    #print("Add and commit"+"\n")

    return review_yaml_file_path, review_status_yaml_file_path


def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
    """
    Second: Once you're done reviewing the yaml representation of the hdf5 file in the review
    folder, change the review status to 'submitted' and save (add and commit) the modified
    .yaml and .txt files in the project by running this function.
    """
    # 1. Verify review initialization was performed first
    # 2. Change review status in the txt file to 'submitted'
    # 3. git add review/ and git commit -m "Submitted metadata review"

    initials = reviewer_attrs['initials']
    #branch_name = '-'.join([reviewer_attrs['type'],'review','by',initials])
    branch_name = '_'.join(['review', initials])
    # TODO: replace with subprocess + git
    #checkout_review_branch(repo_obj, branch_name)

    # Check you are working on the right branch
    curr_branch = show_current_branch()
    if branch_name not in curr_branch.stdout:
        raise ValueError('Please checkout ' + branch_name + ' via Git Bash before submitting metadata review files.')

    # Collect modified review files
    status = get_status()
    modified_files = []
    for line in status.stdout.splitlines():
        # get_status() runs with text=True, so lines are already str
        tmp = line
        if 'modified' in tmp and 'review/' in tmp and os.path.basename(review_yaml_file_path) in tmp:
            modified_files.append(tmp.split()[1])

    # Stage modified files and commit them to the local repository
    review_yaml_file_path_tail, review_yaml_file_path_head = os.path.split(review_yaml_file_path)
    filename, ext = os.path.splitext(review_yaml_file_path_head)
    if modified_files:
        review_status_file_path = os.path.join("review/", filename + "-review_status" + TXT_EXT)
        with open(review_status_file_path, 'a') as f:
            f.write('\nsubmitted')

        modified_files.append(review_status_file_path)

        add_files_to_git(modified_files)
        message = 'Submitted metadata review.'
        commit_output = commit_changes(message)

        for line in commit_output.stdout.splitlines():
            print(line.decode('utf-8'))
    else:
        print('Nothing to commit.')


def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs={}, hdf5_upload=False):
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('The review yaml file must be submitted before performing an update. Run second_save_metadata_review() first.')

    update_hdf5_file_with_review(input_hdf5_file, yaml_review_file)
    perform_git_operations(hdf5_upload)

def last_submit_metadata_review(reviewer_attrs):

    """Fourth: Push the local review branch to the remote repository."""

    initials = reviewer_attrs['initials']

    repository = 'origin'
    branch_name = '_'.join(['review', initials])

    push_command = lambda repository, refspec: ['git', 'push', repository, refspec]

    list_branches_command = ['git', 'branch', '--list']

    branches = subprocess.run(list_branches_command, capture_output=True, text=True, check=True)
    if branch_name not in branches.stdout:
        print('There is no branch named ' + branch_name + '.\n')
        print('Make sure to run the data-owner review workflow from the beginning without missing any steps.')
        return

    curr_branch = show_current_branch()
    if branch_name not in curr_branch.stdout:
        print('The metadata review could not be completed.\n')
        print('Make sure a data-owner workflow has already been started on branch ' + branch_name + '\n')
        print('The step "Complete metadata review" will have no effect.')
        return

    # push
    result = subprocess.run(push_command(repository, branch_name), capture_output=True, text=True, check=True)
    print(result.stdout)

    # 1. git add output_files/
    # 2. delete review/
    #shutil.rmtree(os.path.join(os.path.abspath(os.curdir),"review"))
    # 3. git rm review/
    # 4. git commit -m "Completed review process. Current state of hdf5 file and yml should be up to date."
    return result.returncode


#import config_file
#import hdf5_ops

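# Illustrative end-to-end sketch of the review workflow defined above (comment only; the
# file path and reviewer initials are placeholders taken from main() below):
#
#   reviewer_attrs = {'initials': 'NG', 'type': 'data-owner'}
#   h5_path = 'output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5'
#   review_yaml, review_status = first_initialize_metadata_review(h5_path, reviewer_attrs)
#   # ... edit the copied yaml file in review/ ...
#   second_save_metadata_review(review_yaml, reviewer_attrs)
#   third_update_hdf5_file_with_review(h5_path, review_yaml)
#   last_submit_metadata_review(reviewer_attrs)
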
class MetadataHarvester:
    def __init__(self, parent_files=None):
        if parent_files is None:
            parent_files = []
        self.parent_files = parent_files
        self.metadata = {
            "project": {},
            "sample": {},
            "environment": {},
            "instruments": {},
            "datasets": {}
        }

    def add_project_info(self, key_or_dict, value=None, append=False):
        self._add_info("project", key_or_dict, value, append)

    def add_sample_info(self, key_or_dict, value=None, append=False):
        self._add_info("sample", key_or_dict, value, append)

    def add_environment_info(self, key_or_dict, value=None, append=False):
        self._add_info("environment", key_or_dict, value, append)

    def add_instrument_info(self, key_or_dict, value=None, append=False):
        self._add_info("instruments", key_or_dict, value, append)

    def add_dataset_info(self, key_or_dict, value=None, append=False):
        self._add_info("datasets", key_or_dict, value, append)

    def _add_info(self, category, key_or_dict, value, append):
        """Internal helper method to add information to a category."""
        if isinstance(key_or_dict, dict):
            self.metadata[category].update(key_or_dict)
        else:
            if key_or_dict in self.metadata[category]:
                if append:
                    current_value = self.metadata[category][key_or_dict]

                    if isinstance(current_value, list):
                        if not isinstance(value, list):
                            # Append the new value to the list
                            self.metadata[category][key_or_dict].append(value)
                        else:
                            self.metadata[category][key_or_dict] = current_value + value

                    elif isinstance(current_value, str):
                        # Append the new value as a comma-separated string
                        self.metadata[category][key_or_dict] = current_value + ',' + str(value)
                    else:
                        # Handle other types (for completeness, usually not required)
                        self.metadata[category][key_or_dict] = [current_value, value]
                else:
                    self.metadata[category][key_or_dict] = value
            else:
                self.metadata[category][key_or_dict] = value

    def get_metadata(self):
        return {
            "parent_files": self.parent_files,
            "metadata": self.metadata
        }

    def print_metadata(self):
        print("parent_files", self.parent_files)

        for key in self.metadata.keys():
            print(key, 'metadata:\n')
            for item in self.metadata[key].items():
                print(item[0], item[1])

    def clear_metadata(self):
        self.metadata = {
            "project": {},
            "sample": {},
            "environment": {},
            "instruments": {},
            "datasets": {}
        }
        self.parent_files = []

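# Illustrative usage sketch for MetadataHarvester (comment only; the keys and values are
# placeholders, not part of the module):
#
#   annotator = MetadataHarvester(parent_files=['output_files/experiment.h5'])
#   annotator.add_project_info({'project': 'smog chamber study', 'processing_file': 'processing.py'})
#   annotator.add_instrument_info('gas_analyzer', 'Licor')
#   annotator.add_instrument_info('gas_analyzer', 'Picarro', append=True)
#   # -> metadata['instruments']['gas_analyzer'] == 'Licor,Picarro'
#   annotator.print_metadata()
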
def main():

    output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
    output_yml_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.yaml"
    output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)
    #output_yml_filename_path = hdf5_ops.serialize_metadata(output_filename_path)

    #first_initialize_metadata_review(output_filename_path, initials='NG')
    #second_submit_metadata_review()
    #if os.path.exists(os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename)):
    #    third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename))
    #fourth_complete_metadata_review()
src/hdf5_ops.py (1348 lines; diff truncated because the file is too large)

@@ -1,396 +1,396 @@
import sys
import os

root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)

import pandas as pd
import numpy as np
import h5py
import logging

import utils.g5505_utils as utils
import instruments.readers.filereader_registry as filereader_registry


def __transfer_file_dict_to_hdf5(h5file, group_name, file_dict):
    """
    Transfers data from a file_dict to an HDF5 file.

    Parameters
    ----------
    h5file : h5py.File
        HDF5 file object where the data will be written.
    group_name : str
        Name of the HDF5 group where data will be stored.
    file_dict : dict
        Dictionary containing file data to be transferred. Required structure:
            {
                'name': str,
                'attributes_dict': dict,
                'datasets': [
                    {
                        'name': str,
                        'data': array-like,
                        'shape': tuple,
                        'attributes': dict (optional)
                    },
                    ...
                ]
            }

    Returns
    -------
    stdout : str or Exception
        Status message on success, or the caught exception on failure.
        Returns None if file_dict is empty.
    """

    if not file_dict:
        return

    try:
        # Create group and add its attributes
        filename = file_dict['name']
        group = h5file[group_name].create_group(name=filename)
        # Add group attributes
        group.attrs.update(file_dict['attributes_dict'])

        # Add datasets to the just created group
        for dataset in file_dict['datasets']:
            dataset_obj = group.create_dataset(
                name=dataset['name'],
                data=dataset['data'],
                shape=dataset['shape']
            )

            # Add dataset's attributes
            attributes = dataset.get('attributes', {})
            dataset_obj.attrs.update(attributes)
        group.attrs['last_update_date'] = utils.created_at().encode('utf-8')

        stdout = f'Completed transfer for /{group_name}/{filename}'

    except Exception as inst:
        stdout = inst
        logging.error('Failed to transfer data into HDF5: %s', inst)

    return stdout

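# Illustrative file_dict example matching the structure documented above (comment only;
# the names and values are placeholders):
#
#   file_dict = {
#       'name': 'experiment_log.txt',
#       'attributes_dict': {'instrument': 'gas_analyzer'},
#       'datasets': [{
#           'name': 'data_table',
#           'data': np.zeros((3, 2)),
#           'shape': (3, 2),
#           'attributes': {'units': 'ppb'}
#       }]
#   }
#   # __transfer_file_dict_to_hdf5(h5file, '/', file_dict) would then create the group
#   # '/experiment_log.txt' containing one dataset named 'data_table'.
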
def __copy_file_in_group(source_file_path, dest_file_obj: h5py.File, dest_group_name, work_with_copy: bool = True):
    # Create a copy of the original file to avoid possible file corruption and work with it.
    if work_with_copy:
        tmp_file_path = utils.make_file_copy(source_file_path)
    else:
        tmp_file_path = source_file_path

    # Open the backup h5 file and copy its complete filesystem directory onto a group in h5file
    with h5py.File(tmp_file_path, 'r') as src_file:
        dest_file_obj.copy(source=src_file['/'], dest=dest_group_name)

    if 'tmp_files' in tmp_file_path:
        os.remove(tmp_file_path)

    stdout = f'Completed transfer for /{dest_group_name}'
    return stdout

def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
                                          path_to_filenames_dict: dict = None,
                                          select_dir_keywords: list = [],
                                          root_metadata_dict: dict = {}, mode='w'):

    """
    Creates an .h5 file that preserves the directory tree (or folder structure) of a given
    filesystem path. The output file is named after the input directory, with an .h5 extension.

    The data integration capabilities are limited by our file readers, which can only access data from a list of
    admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
    Files are formatted as composite objects consisting of a group, file, and attributes.

    Parameters
    ----------
    path_to_input_directory : str
        Path to the root directory, specified with forward slashes, e.g., path/to/root.
    path_to_filenames_dict : dict, optional
        A pre-processed dictionary where keys are directory paths on the input directory's tree and values are lists of files.
        If provided, the input directory is not scanned again.
    select_dir_keywords : list
        List of string elements used to select only directory paths that contain
        a word in 'select_dir_keywords'. When empty, all directory paths are considered
        to be included in the HDF5 file group hierarchy.
    root_metadata_dict : dict
        Metadata to include at the root level of the HDF5 file.
    mode : str
        'w' creates the file, truncating it if it exists; 'r+' opens an existing file for read/write. By default, mode = 'w'.

    Returns
    -------
    path_to_output_file : str
        Path to the created HDF5 file.
    """

    if mode not in ['w', 'r+']:
        raise ValueError('Parameter mode must take values in ["w","r+"]')

    if '/' not in path_to_input_directory:
        raise ValueError('path_to_input_directory needs to be specified using forward slashes "/".')

    #path_to_output_directory = os.path.join(path_to_input_directory,'..')
    path_to_input_directory = os.path.normpath(path_to_input_directory).rstrip(os.sep)

    for i, keyword in enumerate(select_dir_keywords):
        select_dir_keywords[i] = keyword.replace('/', os.sep)

    if not path_to_filenames_dict:
        # With dry_run=True, this returns the path-to-files dictionary of the output directory without making an actual copy of the input directory.
        # Therefore, there won't be a copying conflict when the input and output directories are set to the same path.
        path_to_filenames_dict = utils.copy_directory_with_contraints(input_dir_path=path_to_input_directory,
                                                                      output_dir_path=path_to_input_directory,
                                                                      dry_run=True)
    # Set input_directory as copied input directory
    root_dir = path_to_input_directory
    path_to_output_file = path_to_input_directory.rstrip(os.path.sep) + '.h5'

    start_message = f'\n[Start] Data integration :\nSource: {path_to_input_directory}\nDestination: {path_to_output_file}\n'

    print(start_message)
    logging.info(start_message)

    # Check if the .h5 file already exists
    if os.path.exists(path_to_output_file) and mode in ['w']:
        message = (
            f"[Notice] The file '{path_to_output_file}' already exists and will not be overwritten.\n"
            "If you wish to replace it, please delete the existing file first and rerun the program."
        )
        print(message)
        logging.error(message)
    else:
        with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:

            number_of_dirs = len(path_to_filenames_dict.keys())
            dir_number = 1
            for dirpath, filtered_filenames_list in path_to_filenames_dict.items():

                # Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
                if not filtered_filenames_list:
                    continue

                group_name = dirpath.replace(os.sep, '/')
                group_name = group_name.replace(root_dir.replace(os.sep, '/') + '/', '/')

                # Flatten group name to one level
                if select_dir_keywords:
                    offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
                else:
                    offset = 1
                tmp_list = group_name.split('/')
                if len(tmp_list) > offset + 1:
                    group_name = '/'.join([tmp_list[i] for i in range(offset + 1)])

                # Create group called "group_name". A hierarchy of nested groups can be implicitly defined by the forward slashes.
                if group_name not in h5file.keys():
                    h5file.create_group(group_name)
                    h5file[group_name].attrs['creation_date'] = utils.created_at().encode('utf-8')
                    #h5file[group_name].attrs.create(name='filtered_file_list',data=convert_string_to_bytes(filtered_filename_list))
                    #h5file[group_name].attrs.create(name='file_list',data=convert_string_to_bytes(filenames_list))
                #else:
                #    print(group_name,' was already created.')
                instFoldermsgStart = f'Starting data transfer from instFolder: {group_name}'
                print(instFoldermsgStart)

                for filenumber, filename in enumerate(filtered_filenames_list):

                    #file_ext = os.path.splitext(filename)[1]
                    #try:

                    # hdf5 path to filename group
                    dest_group_name = f'{group_name}/{filename}'

                    if 'h5' not in filename:
                        #file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename))
                        #file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
                        file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath, filename))

                        stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict)

                    else:
                        source_file_path = os.path.join(dirpath, filename)
                        dest_file_obj = h5file
                        #group_name +'/'+filename
                        #ext_to_reader_dict[file_ext](source_file_path, dest_file_obj, dest_group_name)
                        #g5505f_reader.select_file_reader(dest_group_name)(source_file_path, dest_file_obj, dest_group_name)
                        stdout = __copy_file_in_group(source_file_path, dest_file_obj, dest_group_name, False)

                # Update the progress bar and log the end message
                instFoldermsdEnd = f'\nCompleted data transfer for instFolder: {group_name}\n'
                utils.progressBar(dir_number, number_of_dirs, instFoldermsdEnd)
                logging.info(instFoldermsdEnd)
                dir_number = dir_number + 1

            print('[End] Data integration')
            logging.info('[End] Data integration')

            if len(root_metadata_dict.keys()) > 0:
                for key, value in root_metadata_dict.items():
                    #if key in h5file.attrs:
                    #    del h5file.attrs[key]
                    h5file.attrs.create(key, value)
                #annotate_root_dir(output_filename,root_metadata_dict)

            #output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename)

    return path_to_output_file  #, output_yml_filename_path

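# Illustrative usage sketch (comment only; the directory layout and keywords are placeholders):
#
#   # Given a directory tree like data/smog_chamber/gas_analyzer/*.txt
#   output_h5 = create_hdf5_file_from_filesystem_path(
#       'data/smog_chamber',
#       select_dir_keywords=['gas_analyzer'],
#       root_metadata_dict={'project': 'smog chamber study'})
#   # -> creates 'data/smog_chamber.h5', mirroring the selected folders as HDF5 groups.
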
def create_hdf5_file_from_dataframe(ofilename, input_data, group_by_funcs: list, approach: str = None, extract_attrs_func=None):
    """
    Creates an HDF5 file with hierarchical groups based on the specified grouping functions or columns.

    Parameters:
    -----------
        ofilename (str): Path for the output HDF5 file.
        input_data (pd.DataFrame or str): Input data as a DataFrame or a valid file system path.
        group_by_funcs (list): List of callables or column names to define hierarchical grouping.
        approach (str): Specifies the approach ('top-down' or 'bottom-up') for creating the HDF5 file.
        extract_attrs_func (callable, optional): Function to extract additional attributes for HDF5 groups.

    Returns:
    --------
        ofilename (str): Path to the created HDF5 file.
    """
    # Check whether input_data is a valid file-system path or a DataFrame
    is_valid_path = lambda x: os.path.exists(x) if isinstance(x, str) else False

    if is_valid_path(input_data):
        # If input_data is a file-system path, create a DataFrame with file info
        file_list = os.listdir(input_data)
        df = pd.DataFrame(file_list, columns=['filename'])
        df = utils.augment_with_filetype(df)  # Add filetype information if needed
    elif isinstance(input_data, pd.DataFrame):
        # If input_data is a DataFrame, make a copy
        df = input_data.copy()
    else:
        raise ValueError("input_data must be either a valid file-system path or a DataFrame.")

    # Generate grouping columns based on group_by_funcs
    if utils.is_callable_list(group_by_funcs):
        grouping_cols = []
        for i, func in enumerate(group_by_funcs):
            col_name = f'level_{i}_groups'
            grouping_cols.append(col_name)
            df[col_name] = func(df)
    elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
        grouping_cols = group_by_funcs
    else:
        raise ValueError("'group_by_funcs' must be a list of callables or valid column names in the DataFrame.")

    # Generate group paths
    df['group_path'] = ['/' + '/'.join(row) for row in df[grouping_cols].values.astype(str)]

    # Open the HDF5 file in write mode
    with h5py.File(ofilename, 'w') as file:
        for group_path in df['group_path'].unique():
            # Create groups in HDF5
            group = file.create_group(group_path)

            # Filter the DataFrame for the current group
            datatable = df[df['group_path'] == group_path].copy()

            # Drop grouping columns and the generated 'group_path'
            datatable = datatable.drop(columns=grouping_cols + ['group_path'])

            # Add datasets to groups if data exists
            if not datatable.empty:
                dataset = utils.convert_dataframe_to_np_structured_array(datatable)
                group.create_dataset(name='data_table', data=dataset)

            # Add attributes if extract_attrs_func is provided
            if extract_attrs_func:
                attrs = extract_attrs_func(datatable)
                for key, value in attrs.items():
                    group.attrs[key] = value

        # Save metadata about depth of hierarchy
        file.attrs.create(name='depth', data=len(grouping_cols) - 1)

    print(f"HDF5 file created successfully at {ofilename}")

    return ofilename


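# Illustrative usage sketch (comment only; the column names and file name are placeholders):
#
#   df = pd.DataFrame({'campaign': ['smog', 'smog'],
#                      'instrument': ['gas_analyzer', 'aethalometer'],
#                      'value': [1.0, 2.0]})
#   create_hdf5_file_from_dataframe('grouped_output.h5', df,
#                                   group_by_funcs=['campaign', 'instrument'])
#   # -> groups '/smog/gas_analyzer' and '/smog/aethalometer', each with a 'data_table' dataset.
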
def save_processed_dataframe_to_hdf5(df, annotator, output_filename):  # src_hdf5_path, script_date, script_name):
    """
    Save processed dataframe columns with annotations to an HDF5 file.

    Parameters:
        df (pd.DataFrame): DataFrame containing processed time series.
        annotator: Annotator object with a get_metadata() method (e.g., MetadataHarvester).
        output_filename (str): Path to the destination HDF5 file.
    """
    # Convert datetime columns to string
    datetime_cols = df.select_dtypes(include=['datetime64']).columns

    if list(datetime_cols):
        df[datetime_cols] = df[datetime_cols].map(str)

    # Convert dataframe to structured array
    icad_data_table = utils.convert_dataframe_to_np_structured_array(df)

    # Get metadata
    metadata_dict = annotator.get_metadata()

    # Prepare project level attributes to be added at the root level
    project_level_attributes = metadata_dict['metadata']['project']

    # Prepare high-level attributes
    high_level_attributes = {
        'parent_files': metadata_dict['parent_files'],
        **metadata_dict['metadata']['sample'],
        **metadata_dict['metadata']['environment'],
        **metadata_dict['metadata']['instruments']
    }

    # Prepare data level attributes
    data_level_attributes = metadata_dict['metadata']['datasets']

    for key, value in data_level_attributes.items():
        if isinstance(value, dict):
            data_level_attributes[key] = utils.convert_attrdict_to_np_structured_array(value)

    # Prepare file dictionary
    file_dict = {
        'name': project_level_attributes['processing_file'],
        'attributes_dict': high_level_attributes,
        'datasets': [{
            'name': "data_table",
            'data': icad_data_table,
            'shape': icad_data_table.shape,
            'attributes': data_level_attributes
        }]
    }

    # Check if the file exists
    if os.path.exists(output_filename):
        mode = "a"
        print(f"File {output_filename} exists. Opening in append mode.")
    else:
        mode = "w"
        print(f"File {output_filename} does not exist. Creating a new file.")

    # Write to HDF5
    with h5py.File(output_filename, mode) as h5file:
        # Add project level attributes at the root/top level
        h5file.attrs.update(project_level_attributes)
        __transfer_file_dict_to_hdf5(h5file, '/', file_dict)


#if __name__ == '__main__':
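# Illustrative usage sketch combining a MetadataHarvester annotator (from git_ops.py) with
# the function above (comment only; column names and metadata values are placeholders):
#
#   annotator = MetadataHarvester(parent_files=['output_files/experiment.h5'])
#   annotator.add_project_info({'project': 'smog chamber study', 'processing_file': 'processed_timeseries'})
#   annotator.add_dataset_info('data_table', {'units': 'ppb'})
#   df = pd.DataFrame({'timestamp': pd.date_range('2024-03-19', periods=3, freq='h'),
#                      'concentration': [1.0, 2.0, 3.0]})
#   save_processed_dataframe_to_hdf5(df, annotator, 'output_files/processed.h5')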
import sys
|
||||
import os
|
||||
root_dir = os.path.abspath(os.curdir)
|
||||
sys.path.append(root_dir)
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import h5py
|
||||
import logging
|
||||
|
||||
import utils.g5505_utils as utils
|
||||
import instruments.readers.filereader_registry as filereader_registry
|
||||
|
||||
|
||||
|
||||
def __transfer_file_dict_to_hdf5(h5file, group_name, file_dict):
|
||||
"""
|
||||
Transfers data from a file_dict to an HDF5 file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
h5file : h5py.File
|
||||
HDF5 file object where the data will be written.
|
||||
group_name : str
|
||||
Name of the HDF5 group where data will be stored.
|
||||
file_dict : dict
|
||||
Dictionary containing file data to be transferred. Required structure:
|
||||
{
|
||||
'name': str,
|
||||
'attributes_dict': dict,
|
||||
'datasets': [
|
||||
{
|
||||
'name': str,
|
||||
'data': array-like,
|
||||
'shape': tuple,
|
||||
'attributes': dict (optional)
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
"""
|
||||
|
||||
if not file_dict:
|
||||
return
|
||||
|
||||
try:
|
||||
# Create group and add their attributes
|
||||
filename = file_dict['name']
|
||||
group = h5file[group_name].create_group(name=filename)
|
||||
# Add group attributes
|
||||
group.attrs.update(file_dict['attributes_dict'])
|
||||
|
||||
# Add datasets to the just created group
|
||||
for dataset in file_dict['datasets']:
|
||||
dataset_obj = group.create_dataset(
|
||||
name=dataset['name'],
|
||||
data=dataset['data'],
|
||||
shape=dataset['shape']
|
||||
)
|
||||
|
||||
# Add dataset's attributes
|
||||
attributes = dataset.get('attributes', {})
|
||||
dataset_obj.attrs.update(attributes)
|
||||
group.attrs['last_update_date'] = utils.created_at().encode('utf-8')
|
||||
|
||||
stdout = f'Completed transfer for /{group_name}/{filename}'
|
||||
|
||||
except Exception as inst:
|
||||
stdout = inst
|
||||
logging.error('Failed to transfer data into HDF5: %s', inst)
|
||||
|
||||
return stdout
|
||||
|
||||
def __copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name, work_with_copy : bool = True):
|
||||
# Create copy of original file to avoid possible file corruption and work with it.
|
||||
|
||||
if work_with_copy:
|
||||
tmp_file_path = utils.make_file_copy(source_file_path)
|
||||
else:
|
||||
tmp_file_path = source_file_path
|
||||
|
||||
# Open backup h5 file and copy complet filesystem directory onto a group in h5file
|
||||
with h5py.File(tmp_file_path,'r') as src_file:
|
||||
dest_file_obj.copy(source= src_file['/'], dest= dest_group_name)
|
||||
|
||||
if 'tmp_files' in tmp_file_path:
|
||||
os.remove(tmp_file_path)
|
||||
|
||||
stdout = f'Completed transfer for /{dest_group_name}'
|
||||
return stdout
|
||||
|
||||
def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
|
||||
path_to_filenames_dict: dict = None,
|
||||
select_dir_keywords : list = [],
|
||||
root_metadata_dict : dict = {}, mode = 'w'):
|
||||
|
||||
"""
|
||||
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
|
||||
of a given filesystem path.
|
||||
|
||||
The data integration capabilities are limited by our file reader, which can only access data from a list of
|
||||
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
|
||||
Files are formatted as composite objects consisting of a group, file, and attributes.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
output_filename : str
|
||||
Name of the output HDF5 file.
|
||||
path_to_input_directory : str
|
||||
Path to root directory, specified with forward slashes, e.g., path/to/root.
|
||||
|
||||
path_to_filenames_dict : dict, optional
|
||||
A pre-processed dictionary where keys are directory paths on the input directory's tree and values are lists of files.
|
||||
If provided, 'input_file_system_path' is ignored.
|
||||
|
||||
select_dir_keywords : list
|
||||
List of string elements to consider or select only directory paths that contain
|
||||
a word in 'select_dir_keywords'. When empty, all directory paths are considered
|
||||
to be included in the HDF5 file group hierarchy.
|
||||
root_metadata_dict : dict
|
||||
Metadata to include at the root level of the HDF5 file.
|
||||
|
||||
mode : str
|
||||
'w' create File, truncate if it exists, or 'r+' read/write, File must exists. By default, mode = "w".
|
||||
|
||||
Returns
|
||||
-------
|
||||
output_filename : str
|
||||
Path to the created HDF5 file.
|
||||
"""
|
||||
|
||||
|
||||
if not mode in ['w','r+']:
|
||||
raise ValueError(f'Parameter mode must take values in ["w","r+"]')
|
||||
|
||||
if not '/' in path_to_input_directory:
|
||||
raise ValueError('path_to_input_directory needs to be specified using forward slashes "/".' )
|
||||
|
||||
#path_to_output_directory = os.path.join(path_to_input_directory,'..')
|
||||
path_to_input_directory = os.path.normpath(path_to_input_directory).rstrip(os.sep)
|
||||
|
||||
|
||||
for i, keyword in enumerate(select_dir_keywords):
|
||||
select_dir_keywords[i] = keyword.replace('/',os.sep)
|
||||
|
||||
if not path_to_filenames_dict:
|
||||
# On dry_run=True, returns path to files dictionary of the output directory without making a actual copy of the input directory.
|
||||
# Therefore, there wont be a copying conflict by setting up input and output directories the same
|
||||
path_to_filenames_dict = utils.copy_directory_with_contraints(input_dir_path=path_to_input_directory,
|
||||
output_dir_path=path_to_input_directory,
|
||||
dry_run=True)
|
||||
# Set input_directory as copied input directory
|
||||
root_dir = path_to_input_directory
|
||||
path_to_output_file = path_to_input_directory.rstrip(os.path.sep) + '.h5'
|
||||
|
||||
start_message = f'\n[Start] Data integration :\nSource: {path_to_input_directory}\nDestination: {path_to_output_file}\n'
|
||||
|
||||
print(start_message)
|
||||
logging.info(start_message)
|
||||
|
||||
# Check if the .h5 file already exists
|
||||
if os.path.exists(path_to_output_file) and mode in ['w']:
|
||||
message = (
|
||||
f"[Notice] The file '{path_to_output_file}' already exists and will not be overwritten.\n"
|
||||
"If you wish to replace it, please delete the existing file first and rerun the program."
|
||||
)
|
||||
print(message)
|
||||
logging.error(message)
|
||||
else:
|
||||
with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
|
||||
|
||||
number_of_dirs = len(path_to_filenames_dict.keys())
|
||||
dir_number = 1
|
||||
for dirpath, filtered_filenames_list in path_to_filenames_dict.items():
|
||||
|
||||
# Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
|
||||
if not filtered_filenames_list:
|
||||
continue
|
||||
|
||||
group_name = dirpath.replace(os.sep,'/')
|
||||
group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/')
|
||||
|
||||
# Flatten group name to one level
|
||||
if select_dir_keywords:
|
||||
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
|
||||
else:
|
||||
offset = 1
|
||||
tmp_list = group_name.split('/')
|
||||
if len(tmp_list) > offset+1:
|
||||
group_name = '/'.join([tmp_list[i] for i in range(offset+1)])
|
||||
|
||||
# Create group called "group_name". Hierarchy of nested groups can be implicitly defined by the forward slashes
|
||||
if not group_name in h5file.keys():
|
||||
h5file.create_group(group_name)
|
||||
h5file[group_name].attrs['creation_date'] = utils.created_at().encode('utf-8')
|
||||
#h5file[group_name].attrs.create(name='filtered_file_list',data=convert_string_to_bytes(filtered_filename_list))
|
||||
#h5file[group_name].attrs.create(name='file_list',data=convert_string_to_bytes(filenames_list))
|
||||
#else:
|
||||
#print(group_name,' was already created.')
|
||||
instFoldermsgStart = f'Starting data transfer from instFolder: {group_name}'
|
||||
print(instFoldermsgStart)
|
||||
|
||||
for filenumber, filename in enumerate(filtered_filenames_list):
|
||||
|
||||
#file_ext = os.path.splitext(filename)[1]
|
||||
#try:
|
||||
|
||||
# hdf5 path to filename group
|
||||
dest_group_name = f'{group_name}/{filename}'
|
||||
|
||||
if not 'h5' in filename:
|
||||
#file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename))
|
||||
#file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
|
||||
file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath,filename))
|
||||
|
||||
stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict)
|
||||
|
||||
else:
|
||||
source_file_path = os.path.join(dirpath,filename)
|
||||
dest_file_obj = h5file
|
||||
#group_name +'/'+filename
|
||||
#ext_to_reader_dict[file_ext](source_file_path, dest_file_obj, dest_group_name)
|
||||
#g5505f_reader.select_file_reader(dest_group_name)(source_file_path, dest_file_obj, dest_group_name)
|
||||
stdout = __copy_file_in_group(source_file_path, dest_file_obj, dest_group_name, False)
|
||||
|
||||
# Update the progress bar and log the end message
|
||||
instFoldermsdEnd = f'\nCompleted data transfer for instFolder: {group_name}\n'
|
||||
# Print and log the start message
|
||||
utils.progressBar(dir_number, number_of_dirs, instFoldermsdEnd)
|
||||
logging.info(instFoldermsdEnd )
|
||||
dir_number = dir_number + 1
|
||||
|
||||
print('[End] Data integration')
|
||||
logging.info('[End] Data integration')
|
||||
|
||||
if len(root_metadata_dict.keys())>0:
|
||||
for key, value in root_metadata_dict.items():
|
||||
#if key in h5file.attrs:
|
||||
# del h5file.attrs[key]
|
||||
h5file.attrs.create(key, value)
|
||||
#annotate_root_dir(output_filename,root_metadata_dict)
|
||||
|
||||
|
||||
#output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename)
|
||||
|
||||
return path_to_output_file #, output_yml_filename_path


def create_hdf5_file_from_dataframe(ofilename, input_data, group_by_funcs: list, approach: str = None, extract_attrs_func=None):
    """
    Creates an HDF5 file with hierarchical groups based on the specified grouping functions or columns.

    Parameters:
    -----------
    ofilename (str): Path for the output HDF5 file.
    input_data (pd.DataFrame or str): Input data as a DataFrame or a valid file system path.
    group_by_funcs (list): List of callables or column names that define the hierarchical grouping.
    approach (str): Specifies the approach ('top-down' or 'bottom-up') for creating the HDF5 file.
    extract_attrs_func (callable, optional): Function to extract additional attributes for HDF5 groups.

    Returns:
    --------
    ofilename (str): Path of the created HDF5 file.
    """
    # Check whether input_data is a valid file-system path or a DataFrame
    is_valid_path = lambda x: os.path.exists(x) if isinstance(x, str) else False

    if is_valid_path(input_data):
        # If input_data is a file-system path, create a DataFrame with file info
        file_list = os.listdir(input_data)
        df = pd.DataFrame(file_list, columns=['filename'])
        df = utils.augment_with_filetype(df)  # Add filetype information if needed
    elif isinstance(input_data, pd.DataFrame):
        # If input_data is a DataFrame, make a copy
        df = input_data.copy()
    else:
        raise ValueError("input_data must be either a valid file-system path or a DataFrame.")

    # Generate grouping columns based on group_by_funcs
    if utils.is_callable_list(group_by_funcs):
        grouping_cols = []
        for i, func in enumerate(group_by_funcs):
            col_name = f'level_{i}_groups'
            grouping_cols.append(col_name)
            df[col_name] = func(df)
    elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
        grouping_cols = group_by_funcs
    else:
        raise ValueError("'group_by_funcs' must be a list of callables or valid column names in the DataFrame.")

    # Generate group paths
    df['group_path'] = ['/' + '/'.join(row) for row in df[grouping_cols].values.astype(str)]

    # Open the HDF5 file in write mode
    with h5py.File(ofilename, 'w') as file:
        for group_path in df['group_path'].unique():
            # Create groups in HDF5
            group = file.create_group(group_path)

            # Filter the DataFrame for the current group
            datatable = df[df['group_path'] == group_path].copy()

            # Drop grouping columns and the generated 'group_path'
            datatable = datatable.drop(columns=grouping_cols + ['group_path'])

            # Add datasets to groups if data exists
            if not datatable.empty:
                dataset = utils.convert_dataframe_to_np_structured_array(datatable)
                group.create_dataset(name='data_table', data=dataset)

            # Add attributes if extract_attrs_func is provided
            if extract_attrs_func:
                attrs = extract_attrs_func(datatable)
                for key, value in attrs.items():
                    group.attrs[key] = value

        # Save metadata about depth of hierarchy
        file.attrs.create(name='depth', data=len(grouping_cols) - 1)

    print(f"HDF5 file created successfully at {ofilename}")

    return ofilename
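

# Illustrative usage sketch for create_hdf5_file_from_dataframe (hypothetical column names and output path,
# not part of the original module): group a small DataFrame by an 'instrument' column, which relies on the
# project's utils helpers (is_str_list, convert_dataframe_to_np_structured_array) behaving as used above.
def _demo_create_hdf5_file_from_dataframe():
    demo_df = pd.DataFrame({'instrument': ['icad', 'icad', 'gas'],
                            'filename': ['a.txt', 'b.txt', 'c.txt'],
                            'value': [1.0, 2.0, 3.0]})
    # Column-name based grouping: one HDF5 group per unique 'instrument' value, each with a 'data_table' dataset.
    return create_hdf5_file_from_dataframe('demo_output.h5', demo_df, group_by_funcs=['instrument'])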


def save_processed_dataframe_to_hdf5(df, annotator, output_filename):  # src_hdf5_path, script_date, script_name):
    """
    Save processed dataframe columns with annotations to an HDF5 file.

    Parameters:
    df (pd.DataFrame): DataFrame containing processed time series.
    annotator (object): Annotator object with a get_metadata method.
    output_filename (str): Path to the output HDF5 file.
    """
    # Convert datetime columns to string
    datetime_cols = df.select_dtypes(include=['datetime64']).columns

    if list(datetime_cols):
        df[datetime_cols] = df[datetime_cols].map(str)

    # Convert dataframe to structured array
    icad_data_table = utils.convert_dataframe_to_np_structured_array(df)

    # Get metadata
    metadata_dict = annotator.get_metadata()

    # Prepare project-level attributes to be added at the root level
    project_level_attributes = metadata_dict['metadata']['project']

    # Prepare high-level attributes
    high_level_attributes = {
        'parent_files': metadata_dict['parent_files'],
        **metadata_dict['metadata']['sample'],
        **metadata_dict['metadata']['environment'],
        **metadata_dict['metadata']['instruments']
    }

    # Prepare data-level attributes
    data_level_attributes = metadata_dict['metadata']['datasets']

    for key, value in data_level_attributes.items():
        if isinstance(value, dict):
            data_level_attributes[key] = utils.convert_attrdict_to_np_structured_array(value)

    # Prepare file dictionary
    file_dict = {
        'name': project_level_attributes['processing_file'],
        'attributes_dict': high_level_attributes,
        'datasets': [{
            'name': "data_table",
            'data': icad_data_table,
            'shape': icad_data_table.shape,
            'attributes': data_level_attributes
        }]
    }

    # Check if the file exists
    if os.path.exists(output_filename):
        mode = "a"
        print(f"File {output_filename} exists. Opening in append mode.")
    else:
        mode = "w"
        print(f"File {output_filename} does not exist. Creating a new file.")

    # Write to HDF5
    with h5py.File(output_filename, mode) as h5file:
        # Add project-level attributes at the root/top level
        h5file.attrs.update(project_level_attributes)
        __transfer_file_dict_to_hdf5(h5file, '/', file_dict)


#if __name__ == '__main__':
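

# Illustrative sketch of the metadata layout that save_processed_dataframe_to_hdf5 reads from the annotator
# (hypothetical values; the real Annotator class is defined elsewhere in the project). The nesting mirrors the
# keys accessed above: 'parent_files' plus 'metadata' with 'project', 'sample', 'environment', 'instruments',
# and 'datasets' sections, where 'project' must provide 'processing_file'.
class _DemoAnnotator:
    def get_metadata(self):
        return {'parent_files': 'raw_data.h5',
                'metadata': {'project': {'processing_file': 'processed_icad', 'processing_date': '2024-01-01'},
                             'sample': {'sample_name': 'demo_sample'},
                             'environment': {'cell_pressure': '10 mbar'},
                             'instruments': {'instrument': 'icad'},
                             'datasets': {'data_table': {'units': 'a.u.'}}}}
# Example call (sketch): save_processed_dataframe_to_hdf5(processed_df, _DemoAnnotator(), 'processed_output.h5')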
@ -1,270 +1,270 @@
import pandas as pd
import logging
import os
import datetime
from pybis import Openbis
import hidden

admissible_props_list = ['$name', 'filenumber', 'default_experiment.experimental_results',
                         'dataquality', '$xmlcomments', '$annotations_state',
                         'sample_name', 'position_x', 'position_y', 'position_z', 'temp', 'cell_pressure', 'gas_flow_setting', 'sample_notes',
                         'beamline', 'photon_energy', 'slit_entrance_v', 'slit_exit_v', 'izero',
                         'slit_exit_h', 'hos', 'cone', 'endstation', 'hof',
                         'method_name', 'region', 'lens_mode', 'acq_mode', 'dwell_time', 'frames', 'passenergy',
                         'iterations', 'sequenceiterations', 'ke_range_center', 'ke_step']


def initialize_openbis_obj():

    # TODO: implement a more secure authentication method.
    openbis_obj = Openbis('https://openbis-psi.labnotebook.ch/openbis/webapp/eln-lims/?menuUniqueId=null&viewName=showBlancPage&viewData=null', verify_certificates=False)
    openbis_obj.login(hidden.username, hidden.password)

    return openbis_obj


def align_datetime_observation_windows(df_h5: pd.DataFrame, df_openbis: pd.DataFrame, h5_datetime_var: str = 'lastModifiedDatestr', ob_datetime_var: str = 'registrationDate') -> tuple:

    """ Returns filtered/reduced versions of 'df_h5' and 'df_openbis' with aligned datetime observation windows,
    i.e., the datetime variable range is the same for the two returned dataframes."""
    #"" returns a filtered or reduced version of 'df' by removing all rows that are outside the datetime variable overlapping region between 'df' and 'df_ref'.
    #""

    #df_h5['lastModifiedDatestr'] = df_h5['lastModifiedDatestr'].astype('datetime64[ns]')
    #df_h5 = df_h5.sort_values(by='lastModifiedDatestr')

    if not (h5_datetime_var in df_h5.columns.to_list() and ob_datetime_var in df_openbis.columns.to_list()):
        # TODO: Check if ValueError is the best type of error to raise here
        raise ValueError("Dataframes 'df_h5' and 'df_openbis' must contain the columns named by 'h5_datetime_var' and 'ob_datetime_var', storing values in a suitable datetime string format (e.g., yyyy-mm-dd hh:mm:ss).")

    df_h5[h5_datetime_var] = df_h5[h5_datetime_var].astype('datetime64[ns]')
    df_openbis[ob_datetime_var] = df_openbis[ob_datetime_var].astype('datetime64[ns]')

    min_timestamp = max([df_openbis[ob_datetime_var].min(), df_h5[h5_datetime_var].min()])
    max_timestamp = min([df_openbis[ob_datetime_var].max(), df_h5[h5_datetime_var].max()])

    # Determine the overlap between df_h5 and df_openbis, and filter out all rows whose datetime variable lies outside the overlapping datetime region.
    datetime_overlap_indicator = (df_h5[h5_datetime_var] >= min_timestamp) & (df_h5[h5_datetime_var] <= max_timestamp)
    df_h5 = df_h5.loc[datetime_overlap_indicator, :]

    datetime_overlap_indicator = (df_openbis[ob_datetime_var] >= min_timestamp) & (df_openbis[ob_datetime_var] <= max_timestamp)
    df_openbis = df_openbis.loc[datetime_overlap_indicator, :]

    df_h5 = df_h5.sort_values(by=h5_datetime_var)
    df_openbis = df_openbis.sort_values(by=ob_datetime_var)

    return df_h5, df_openbis
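

# Illustrative sketch of the alignment step (hypothetical timestamps and names, not part of the original module):
# only the rows falling inside the common datetime window of both dataframes are kept.
def _demo_align_datetime_observation_windows():
    demo_df_h5 = pd.DataFrame({'lastModifiedDatestr': ['2024-01-01 10:00:00', '2024-01-03 10:00:00'],
                               'name': ['0059059_N1s_280eV', '0060060_C1s_280eV']})
    demo_df_openbis = pd.DataFrame({'registrationDate': ['2024-01-02 09:00:00', '2024-01-04 09:00:00'],
                                    'FILENUMBER': ['59', '60']})
    # Returns the second h5 row and the first openbis row, the only ones inside the shared window.
    return align_datetime_observation_windows(demo_df_h5, demo_df_openbis)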


def reformat_openbis_dataframe_filenumber(df_openbis):

    if not 'FILENUMBER' in df_openbis.columns:
        raise ValueError('df_openbis does not contain the column "FILENUMBER". Make sure you query it (e.g., o.get_samples(props=["filenumber"])) before creating df_openbis.')
    #if not 'name' in df.columns:
    #    raise ValueError("df does not contain the column 'name'. Ensure df complies with Thorsten's Table's format.")

    # Augment df_openbis with a 'name' column consistent with Thorsten's naming convention
    name_list = ['0' + item.zfill(3) + item.zfill(3) for item in df_openbis['FILENUMBER']]
    df_openbis['REFORMATED_FILENUMBER'] = pd.Series(name_list, index=df_openbis.index)

    return df_openbis
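

# Illustrative sketch of the FILENUMBER reformatting (hypothetical values, not part of the original module):
# a FILENUMBER of '59' becomes '0' + '059' + '059' = '0059059', matching the measurement naming convention.
def _demo_reformat_openbis_dataframe_filenumber():
    demo_df = pd.DataFrame({'FILENUMBER': ['59', '123']})
    return reformat_openbis_dataframe_filenumber(demo_df)['REFORMATED_FILENUMBER'].tolist()  # ['0059059', '0123123']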


def pair_openbis_and_h5_dataframes(df_openbis, df_h5, pairing_ob_var: str, pairing_h5_var: str):

    """ Pairs every row (or openbis sample) in 'df_openbis' with a set of rows (or measurements) in 'df_h5' by matching the i-th row of 'df_openbis'
    with the rows of 'df_h5' for which the string df_openbis.loc[i, pairing_ob_var] is contained in the string df_h5.loc[j, pairing_h5_var].

    Example: pairing_ob_var, pairing_h5_var = 'REFORMATED_FILENUMBER', 'name'

    """
    # Reformat the openbis dataframe filenumber so that it can be used to find associated measurements in the h5 dataframe
    df_openbis = reformat_openbis_dataframe_filenumber(df_openbis)

    related_indices_list = []
    for sample_idx in df_openbis.index:
        sample_value = df_openbis.loc[sample_idx, pairing_ob_var]
        tmp_list = [sample_value in item[0:item.find('_')] for item in df_h5[pairing_h5_var]]
        related_indices_list.append(df_h5.index[tmp_list])

        print('Pairing openbis sample: ' + df_openbis.loc[sample_idx, pairing_ob_var])
        print('with reformatted FILENUMBER: ' + sample_value)
        print('to the following measurements in the h5 dataframe:')
        print(df_h5.loc[df_h5.index[tmp_list], 'name'])
        print('\n')

    df_openbis['related_h5_indices'] = pd.Series(related_indices_list, index=df_openbis.index)

    return df_openbis
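

# Illustrative sketch of the pairing step (hypothetical names, not part of the original module): the reformatted
# FILENUMBER '0059059' is matched against the prefix of each h5 measurement name (the part before the first '_').
def _demo_pair_openbis_and_h5_dataframes():
    demo_df_openbis = pd.DataFrame({'FILENUMBER': ['59']})
    demo_df_h5 = pd.DataFrame({'name': ['0059059_N1s_280eV', '0060060_C1s_280eV']})
    # The single openbis row is paired with the first h5 row only.
    return pair_openbis_and_h5_dataframes(demo_df_openbis, demo_df_h5, 'REFORMATED_FILENUMBER', 'name')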


def range_cols_2_string(df, lb_var, ub_var):
    """ Formats the lower/upper-bound columns 'lb_var' and 'ub_var' of 'df' as a single '/'-separated string of ranges. """

    if not sum(df.loc[:, ub_var] - df.loc[:, lb_var]) == 0:
        #tmp_list = ['-'.join([str(round(df.loc[i,lb_var],2)),str(round(df.loc[i,ub_var],1))]) for i in df.index]
        tmp_list = ['-'.join(["{:.1f}".format(df.loc[i, lb_var]), "{:.1f}".format(df.loc[i, ub_var])]) for i in df.index]
    elif len(df.loc[:, lb_var].unique()) > 1:  # check if values are different
        #tmp_list = [str(round(df.loc[i,lb_var],2)) for i in df.index]
        tmp_list = ["{:.1f}".format(df.loc[i, lb_var]) for i in df.index]
    else:
        #tmp_list = [str(round(df.loc[0,lb_var],2))]
        tmp_list = ["{:.1f}".format(df.loc[0, lb_var])]
    return '/'.join(tmp_list)


def col_2_string(df, column_var):
    """ Formats the values of column 'column_var' of 'df' as a single '/'-separated string. """

    if not column_var in df.columns:
        raise ValueError("'column_var' must belong to df.columns")

    #tmp_list = [str(round(item,1)) for item in df[column_var]]
    tmp_list = ["{:.2f}".format(item) for item in df[column_var]]
    if len(df[column_var].unique()) == 1:
        tmp_list = [tmp_list[0]]

    return '/'.join(tmp_list)
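

# Illustrative sketch of the string-formatting helpers (hypothetical column names and values, not part of the original module):
def _demo_string_helpers():
    demo_df = pd.DataFrame({'ke_low': [10.0, 20.0], 'ke_high': [12.0, 22.0], 'dwell': [0.1, 0.1]})
    ranges = range_cols_2_string(demo_df, 'ke_low', 'ke_high')  # '10.0-12.0/20.0-22.0'
    dwell = col_2_string(demo_df, 'dwell')                      # '0.10' (identical values collapse to a single entry)
    return ranges, dwell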


def compute_openbis_sample_props_from_h5(df_openbis, df_h5, sample_idx):

    prop2attr = {'sample_name': 'sample',  # ask Thorsten whether this assignment is correct or not
                 'position_x': 'smplX_mm',
                 'position_y': 'smplY_mm',
                 'position_z': 'smplZ_mm',
                 'temp': 'sampleTemp_dC',
                 'cell_pressure': 'cellPressure_mbar',
                 #'gas_flow_setting': '',
                 'method_name': 'regionName',  # measurement type: XPS or NEXAFS
                 'region': 'regionName',  # VB/N1s/C1s
                 'passenergy': 'regionName',  # REAL

                 'photon_energy': 'xRayEkinRange_eV',
                 'dwell_time': 'scientaDwellTime_ms',
                 'acq_mode': 'scientaAcquisitionMode',
                 'ke_range_center': 'scientaEkinRange_eV',
                 'ke_step': 'scientaEkinStep_eV',
                 'lens_mode': 'scientaLensMode'
                 }

    sample_identifier = df_openbis.loc[sample_idx, 'identifier']
    props_dict = {'FILENUMBER': df_openbis.loc[sample_idx, 'FILENUMBER']}

    #props_dict = {}

    if not len(df_openbis.loc[sample_idx, 'related_h5_indices']):
        props_dict['identifier'] = sample_identifier
        return props_dict

    reduced_df_h5 = df_h5.loc[df_openbis.loc[sample_idx, 'related_h5_indices'], :]
    reduced_df_h5 = reduced_df_h5.reset_index()

    # Include the related_samples key for validation purposes. Related samples are used to compute averaged and/or combined openbis properties.
    related_sample_list = [reduced_df_h5['name'][index] for index in reduced_df_h5['name'].index]
    related_samples = ' / '.join(related_sample_list)
    props_dict['Subject_samples'] = related_samples

    props_dict['sample_name'] = reduced_df_h5['sample'].unique()[0] if len(reduced_df_h5['sample'].unique()) == 1 else '/'.join(reduced_df_h5['sample'].tolist())

    if not 'NEXAFS' in reduced_df_h5['regionName'].iloc[0]:
        props_dict['identifier'] = sample_identifier
        props_dict['method_name'] = 'XPS'
        for item_idx in reduced_df_h5.index:
            item = reduced_df_h5.loc[item_idx, 'regionName']
            if item_idx > 0:
                props_dict['region'] = props_dict['region'] + '/' + item[0:item.find('_')]
                #props_dict['dwell_time'] = props_dict['dwell_time'] + '/' + str(reduced_df_h5.loc[item_idx,'scientaDwellTime_ms'])
                #props_dict['ke_range_center'] = props_dict['ke_range_center'] + '/' + str(round(reduced_df_h5.loc[item_idx,['scientaEkinRange_eV_1','scientaEkinRange_eV_2']].mean(),2))
                #props_dict['ke_step_center'] = props_dict['ke_step_center'] + '/' + str(reduced_df_h5.loc[item_idx,'scientaEkinStep_eV'])
                #props_dict['passenergy'].append(float(item[item.find('_')+1:item.find('eV')]))
            else:
                props_dict['region'] = item[0:item.find('_')]
                #props_dict['dwell_time'] = str(reduced_df_h5.loc[item_idx,'scientaDwellTime_ms'])
                #props_dict['ke_range_center'] = str(round(reduced_df_h5.loc[item_idx,['scientaEkinRange_eV_1','scientaEkinRange_eV_2']].mean(),2))
                #props_dict['ke_step_center'] = str(reduced_df_h5.loc[item_idx,'scientaEkinStep_eV'])

        #props_dict['passenergy'] = reduced_df_h5.loc[:,'scientaPassEnergy_eV'].min()

    else:
        props_dict = {'identifier': sample_identifier, 'method_name': 'NEXAFS'}

    #props_dict['temp'] = round(reduced_df_h5['sampleTemp_dC'].mean(),2)
    #props_dict['cell_pressure'] = round(reduced_df_h5['cellPressure_mbar'].mean(),2)
    props_dict['temp'] = "{:.2f}".format(reduced_df_h5['sampleTemp_dC'].mean())
    props_dict['cell_pressure'] = "{:.2f}".format(reduced_df_h5['cellPressure_mbar'].mean())

    reduced_df_h5['scientaDwellTime_ms'] = reduced_df_h5['scientaDwellTime_ms'] * 1e-3  # convert ms to seconds
    props_dict['dwell_time'] = col_2_string(reduced_df_h5, 'scientaDwellTime_ms')
    props_dict['passenergy'] = col_2_string(reduced_df_h5, 'scientaPassEnergy_eV')
    props_dict['ke_step_center'] = col_2_string(reduced_df_h5, 'scientaEkinStep_eV')
    #props_dict['photon_energy'] = round(reduced_df_h5[['xRayEkinRange_eV_1','xRayEkinRange_eV_2']].mean(axis=1)[0],2)
    props_dict['photon_energy'] = range_cols_2_string(reduced_df_h5, 'xRayEkinRange_eV_1', 'xRayEkinRange_eV_2')
    props_dict['ke_range_center'] = range_cols_2_string(reduced_df_h5, 'scientaEkinRange_eV_1', 'scientaEkinRange_eV_2')

    props_dict['lens_mode'] = reduced_df_h5['scientaLensMode'][0]
    props_dict['acq_mode'] = reduced_df_h5['scientaAcquisitionMode'][0]

    props_dict['position_x'] = "{:.2f}".format(reduced_df_h5.loc[:, 'smplX_mm'].mean())  # round(reduced_df_h5.loc[:,'smplX_mm'].mean(),2)
    props_dict['position_y'] = "{:.2f}".format(reduced_df_h5.loc[:, 'smplY_mm'].mean())
    props_dict['position_z'] = "{:.2f}".format(reduced_df_h5.loc[:, 'smplZ_mm'].mean())

    return props_dict


def single_sample_update(sample_props_dict, sample_collection, props_include_list):

    """ Updates the sample specified in sample_props_dict in the openbis database; the sample must belong to sample_collection (i.e., the result of openbis_obj.get_samples(...)). """

    try:
        sample_path_identifier = sample_props_dict['identifier']  # path-like index
        sample = sample_collection[sample_path_identifier]
        for prop in sample_props_dict.keys():
            if (prop in admissible_props_list) and (prop in props_include_list):
                sample.props[prop] = sample_props_dict[prop]
        sample.save()
    except Exception as e:
        logging.error(e)

    return 0


def sample_batch_update(openbis_obj, sample_collection, df_openbis, df_h5, props_include_list):

    """ Updates, within a single openbis transaction, every sample in 'sample_collection' whose properties can be computed from 'df_openbis' and 'df_h5'. """

    if not 'related_h5_indices' in df_openbis.columns:
        raise ValueError("Input dataframe 'df_openbis' must contain a column named 'related_h5_indices', resulting from suitable preprocessing steps.")

    # TODO: as a safeguard, create an exclude list containing properties that must not be changed
    exclude_list = ['filenumber', 'FILENUMBER', 'identifier']
    props_include_list = [item for item in props_include_list if item not in exclude_list]

    trans = openbis_obj.new_transaction()
    for sample_idx in range(len(df_openbis['identifier'])):

        props_dict = compute_openbis_sample_props_from_h5(df_openbis, df_h5, sample_idx)
        sample_path_identifier = props_dict['identifier']  # path-like index
        sample = sample_collection[sample_path_identifier]

        for prop in props_dict.keys():
            if prop in props_include_list:
                sample.props[prop] = props_dict[prop]

        trans.add(sample)

    trans.commit()

    return 0


def conduct_dataframe_preprocessing_steps(df_h5, df_openbis):

    if not 'lastModifiedDatestr' in df_h5.columns:
        raise ValueError("Input dataframe 'df_h5' must contain the column 'lastModifiedDatestr'.")

    df_h5, df_openbis = align_datetime_observation_windows(df_h5, df_openbis, 'lastModifiedDatestr', 'registrationDate')
    df_openbis = pair_openbis_and_h5_dataframes(df_openbis, df_h5, 'REFORMATED_FILENUMBER', 'name')

    return df_h5, df_openbis
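

# Illustrative end-to-end sketch (hypothetical dataframes and property list, not part of the original module):
# the typical order is preprocessing the dataframes, then batch-updating openbis samples within one transaction.
def _demo_openbis_update_pipeline(df_h5, df_openbis, props_include_list):
    openbis_obj = initialize_openbis_obj()
    sample_collection = openbis_obj.get_samples(props=['filenumber'])
    df_h5, df_openbis = conduct_dataframe_preprocessing_steps(df_h5, df_openbis)
    return sample_batch_update(openbis_obj, sample_collection, df_openbis, df_h5, props_include_list)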
116
src/utils_bge.py
116
src/utils_bge.py
@ -1,58 +1,58 @@
import scipy.optimize as sp_opt
import pandas as pd


def construct_mask(x, subinterval_list):

    """ Constructs a mask of length len(x) that indicates whether the entries of x lie within the subintervals
    specified in subinterval_list.

    Parameters:
    x (array_like):
    subinterval_list (list of two-element tuples):

    Returns:
    mask (Bool array_like):

    Usage:

    x = np.array([0.0, 0.25, 0.5, 0.75, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
    subinterval_list = [(0.25, 0.75), (2.5, 3.5)]
    mask = construct_mask(x, subinterval_list)

    """

    mask = x < x.min()
    for subinterval in subinterval_list:
        mask = mask | ((x >= subinterval[0]) & (x <= subinterval[1]))

    return mask


def estimate_background(x, y, mask, method: str):

    """ Fits a background model to the values of x and y indicated by a mask, using one of the available methods.

    Parameters:
    x, y (array_like, e.g., np.array, pd.Series):
    mask (Bool array_like):
    method (str):

    Returns:
    y_bg (array_like): values of the fitted model at x, i.e., the obtained background estimate

    """

    if method == 'linear':
        def linear_model(x, m, b):
            return (m * x) + b

        popt, pcov = sp_opt.curve_fit(linear_model, x[mask], y[mask])

        y_bg = linear_model(x, *popt)

    else:
        raise ValueError("Parameter 'method' can only be set to 'linear'. Future code releases may include more options.")

    return y_bg
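

# Illustrative sketch of a linear background estimate (hypothetical signal, not part of the original module):
# the mask selects the flat regions on both sides of a peak, and the fitted line is evaluated over the full x range.
def _demo_estimate_background():
    import numpy as np
    x = np.linspace(0.0, 4.0, 41)
    y = 0.5 * x + 1.0 + np.exp(-((x - 2.0) ** 2) / 0.05)  # linear baseline plus a narrow peak
    mask = construct_mask(x, [(0.0, 1.0), (3.0, 4.0)])    # fit only where the peak is negligible
    return estimate_background(x, y, mask, method='linear')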