diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 7c498a1..8e58bf0 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -481,8 +481,8 @@ import os #import src.hdf5_lib as h5lib import src.g5505_utils as utils import h5py - -def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, script_name): +import src.metadata_review_lib as metadata_lib +def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name): """ Save processed dataframe columns with annotations to an HDF5 file. @@ -495,7 +495,9 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, """ # Convert datetime columns to string datetime_cols = df.select_dtypes(include=['datetime64']).columns - df[datetime_cols] = df[datetime_cols].apply(str) + + if list(datetime_cols): + df[datetime_cols] = df[datetime_cols].map(str) # Convert dataframe to structured array icad_data_table = utils.dataframe_to_np_structured_array(df) @@ -503,6 +505,10 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, # Get metadata metadata_dict = annotator.get_metadata() + # Prepare project level attributes to be added at the root level + + project_level_attributes = metadata_dict['metadata']['project'] + # Prepare high-level attributes high_level_attributes = { 'parent_files': metadata_dict['parent_files'], @@ -514,13 +520,14 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, # Prepare data level attributes data_level_attributes = metadata_dict['metadata']['datasets'] - # Generate output filename - parent_file_name = os.path.split(src_hdf5_path)[1] - output_filename = f'data_products/processed/fig_{script_date}_{parent_file_name}' + for key, value in data_level_attributes.items(): + if isinstance(value,dict): + data_level_attributes[key] = metadata_lib.parse_attribute(value) + # Prepare file dictionary file_dict = { - 'name': script_name, + 'name': project_level_attributes['script_name'], 'attributes_dict': high_level_attributes, 'datasets': [{ 'name': "data_table", @@ -530,8 +537,19 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, }] } + # Check if the file exists + if os.path.exists(output_filename): + mode = "a" + print(f"File {output_filename} exists. Opening in append mode.") + else: + mode = "w" + print(f"File {output_filename} does not exist. Creating a new file.") + + # Write to HDF5 - with h5py.File(output_filename, 'w') as h5file: + with h5py.File(output_filename, mode) as h5file: + # Add project level attributes at the root/top level + h5file.attrs.update(project_level_attributes) transfer_file_dict_to_hdf5(h5file, '/', file_dict) diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py index 17f47df..5744a55 100644 --- a/src/metadata_review_lib.py +++ b/src/metadata_review_lib.py @@ -322,30 +322,55 @@ class MetadataHarvester: parent_files = [] self.parent_files = parent_files self.metadata = { + "project": {}, "sample": {}, "environment": {}, "instruments": {}, "datasets": {} } - def add_sample_info(self, key_or_dict, value=None): - self._add_info("sample", key_or_dict, value) + def add_project_info(self, key_or_dict, value=None, append=False): + self._add_info("project", key_or_dict, value, append) - def add_environment_info(self, key_or_dict, value=None): - self._add_info("environment", key_or_dict, value) + def add_sample_info(self, key_or_dict, value=None, append=False): + self._add_info("sample", key_or_dict, value, append) - def add_instrument_info(self, key_or_dict, value=None): - self._add_info("instruments", key_or_dict, value) + def add_environment_info(self, key_or_dict, value=None, append=False): + self._add_info("environment", key_or_dict, value, append) - def add_dataset_info(self, key_or_dict, value=None): - self._add_info("datasets", key_or_dict, value) + def add_instrument_info(self, key_or_dict, value=None, append=False): + self._add_info("instruments", key_or_dict, value, append) - def _add_info(self, category, key_or_dict, value): + def add_dataset_info(self, key_or_dict, value=None, append=False): + self._add_info("datasets", key_or_dict, value, append) + + def _add_info(self, category, key_or_dict, value, append): """Internal helper method to add information to a category.""" if isinstance(key_or_dict, dict): self.metadata[category].update(key_or_dict) else: - self.metadata[category][key_or_dict] = value + if key_or_dict in self.metadata[category]: + if append: + current_value = self.metadata[category][key_or_dict] + + if isinstance(current_value, list): + + if not isinstance(value, list): + # Append the new value to the list + self.metadata[category][key_or_dict].append(value) + else: + self.metadata[category][key_or_dict] = current_value + value + + elif isinstance(current_value, str): + # Append the new value as a comma-separated string + self.metadata[category][key_or_dict] = current_value + ',' + str(value) + else: + # Handle other types (for completeness, usually not required) + self.metadata[category][key_or_dict] = [current_value, value] + else: + self.metadata[category][key_or_dict] = value + else: + self.metadata[category][key_or_dict] = value def get_metadata(self): return { @@ -365,6 +390,7 @@ class MetadataHarvester: def clear_metadata(self): self.metadata = { + "project": {}, "sample": {}, "environment": {}, "instruments": {},