2024-06-21 15:42:46 +02:00
2 changed files with 62 additions and 18 deletions

View File

@@ -481,8 +481,8 @@ import os
#import src.hdf5_lib as h5lib
import src.g5505_utils as utils
import h5py
def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, script_name):
import src.metadata_review_lib as metadata_lib
def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name):
    """
    Save processed dataframe columns with annotations to an HDF5 file.
@@ -495,7 +495,9 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date,
    """
    # Convert datetime columns to string
    datetime_cols = df.select_dtypes(include=['datetime64']).columns
    df[datetime_cols] = df[datetime_cols].apply(str)
    if list(datetime_cols):
        df[datetime_cols] = df[datetime_cols].map(str)
    # Convert dataframe to structured array
    icad_data_table = utils.dataframe_to_np_structured_array(df)
@@ -503,6 +505,10 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date,
    # Get metadata
    metadata_dict = annotator.get_metadata()
    # Prepare project level attributes to be added at the root level
    project_level_attributes = metadata_dict['metadata']['project']
    # Prepare high-level attributes
    high_level_attributes = {
        'parent_files': metadata_dict['parent_files'],
@@ -514,13 +520,14 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date,
    # Prepare data level attributes
    data_level_attributes = metadata_dict['metadata']['datasets']
    # Generate output filename
    parent_file_name = os.path.split(src_hdf5_path)[1]
    output_filename = f'data_products/processed/fig_{script_date}_{parent_file_name}'
    for key, value in data_level_attributes.items():
        if isinstance(value,dict):
            data_level_attributes[key] = metadata_lib.parse_attribute(value)
    # Prepare file dictionary
    file_dict = {
        'name': script_name,
        'name': project_level_attributes['script_name'],
        'attributes_dict': high_level_attributes,
        'datasets': [{
            'name': "data_table",
@@ -530,8 +537,19 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date,
        }]
    }
    # Check if the file exists
    if os.path.exists(output_filename):
        mode = "a"
        print(f"File {output_filename} exists. Opening in append mode.")
    else:
        mode = "w"
        print(f"File {output_filename} does not exist. Creating a new file.")
    # Write to HDF5
    with h5py.File(output_filename, 'w') as h5file:
    with h5py.File(output_filename, mode) as h5file:
        # Add project level attributes at the root/top level
        h5file.attrs.update(project_level_attributes)
        transfer_file_dict_to_hdf5(h5file, '/', file_dict)
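
For orientation, a hypothetical caller sketch (not part of this commit): the import paths and sample values are assumptions, but it illustrates the revised contract — the caller now supplies output_filename, and the script name is read from the project-level metadata instead of being passed as an argument.

import pandas as pd
# Import paths below are assumptions for illustration; adjust to the repository layout.
from src.g5505_file_reader import save_processed_dataframe_to_hdf5
from src.metadata_annotation_lib import MetadataHarvester

df = pd.DataFrame({'start_time': pd.to_datetime(['2024-06-21 10:00:00']),
                   'O3_ppb': [41.2]})

annotator = MetadataHarvester()
# 'script_name' must be present: file_dict reads it from the project-level attributes.
annotator.add_project_info({'script_name': 'icad_processing.py', 'project': 'example_campaign'})
annotator.add_dataset_info('data_table', {'units': 'ppb'})

# Datetime columns are stringified, the frame becomes a structured array, and the
# project-level attributes land on the HDF5 root. If the file already exists, the
# os.path.exists() check switches h5py.File from "w" to "a" (append) mode.
save_processed_dataframe_to_hdf5(df, annotator, 'data_products/processed/fig_2024-06-21_example.h5')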

View File

@@ -322,30 +322,55 @@ class MetadataHarvester:
            parent_files = []
        self.parent_files = parent_files
        self.metadata = {
            "project": {},
            "sample": {},
            "environment": {},
            "instruments": {},
            "datasets": {}
        }
    def add_sample_info(self, key_or_dict, value=None):
        self._add_info("sample", key_or_dict, value)
    def add_project_info(self, key_or_dict, value=None, append=False):
        self._add_info("project", key_or_dict, value, append)
    def add_environment_info(self, key_or_dict, value=None):
        self._add_info("environment", key_or_dict, value)
    def add_sample_info(self, key_or_dict, value=None, append=False):
        self._add_info("sample", key_or_dict, value, append)
    def add_instrument_info(self, key_or_dict, value=None):
        self._add_info("instruments", key_or_dict, value)
    def add_environment_info(self, key_or_dict, value=None, append=False):
        self._add_info("environment", key_or_dict, value, append)
    def add_dataset_info(self, key_or_dict, value=None):
        self._add_info("datasets", key_or_dict, value)
    def add_instrument_info(self, key_or_dict, value=None, append=False):
        self._add_info("instruments", key_or_dict, value, append)
    def _add_info(self, category, key_or_dict, value):
    def add_dataset_info(self, key_or_dict, value=None, append=False):
        self._add_info("datasets", key_or_dict, value, append)
    def _add_info(self, category, key_or_dict, value, append):
"""Internal helper method to add information to a category."""
if isinstance(key_or_dict, dict):
self.metadata[category].update(key_or_dict)
else:
self.metadata[category][key_or_dict] = value
if key_or_dict in self.metadata[category]:
if append:
current_value = self.metadata[category][key_or_dict]
if isinstance(current_value, list):
if not isinstance(value, list):
# Append the new value to the list
self.metadata[category][key_or_dict].append(value)
else:
self.metadata[category][key_or_dict] = current_value + value
elif isinstance(current_value, str):
# Append the new value as a comma-separated string
self.metadata[category][key_or_dict] = current_value + ',' + str(value)
else:
# Handle other types (for completeness, usually not required)
self.metadata[category][key_or_dict] = [current_value, value]
else:
self.metadata[category][key_or_dict] = value
else:
self.metadata[category][key_or_dict] = value
def get_metadata(self):
return {
@@ -365,6 +390,7 @@ class MetadataHarvester:
    def clear_metadata(self):
        self.metadata = {
            "project": {},
            "sample": {},
            "environment": {},
            "instruments": {},