From a6868d985db1c2366999b661c82bbd31d5b8d893 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Tue, 18 Jun 2024 09:21:46 +0200 Subject: [PATCH 1/6] Fixed bug regarding datetime to str column conversion in dataframe by using .map(str) (element-wise operation) as opposed to .apply(str) --- src/hdf5_lib.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 7c498a1..cab8554 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -495,7 +495,9 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, """ # Convert datetime columns to string datetime_cols = df.select_dtypes(include=['datetime64']).columns - df[datetime_cols] = df[datetime_cols].apply(str) + + if list(datetime_cols): + df[datetime_cols] = df[datetime_cols].map(str) # Convert dataframe to structured array icad_data_table = utils.dataframe_to_np_structured_array(df) From 04558e77851daeda04735e0ce88b5c01da61636d Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Tue, 18 Jun 2024 14:42:51 +0200 Subject: [PATCH 2/6] Added code to parse dict attributes. --- src/hdf5_lib.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index cab8554..a7ab09b 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -481,7 +481,7 @@ import os #import src.hdf5_lib as h5lib import src.g5505_utils as utils import h5py - +import src.metadata_review_lib as metadata_lib def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, script_name): """ Save processed dataframe columns with annotations to an HDF5 file. 
@@ -516,6 +516,10 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, # Prepare data level attributes data_level_attributes = metadata_dict['metadata']['datasets'] + for key, value in data_level_attributes.items(): + if isinstance(value,dict): + data_level_attributes[key] = metadata_lib.parse_attribute(value) + # Generate output filename parent_file_name = os.path.split(src_hdf5_path)[1] output_filename = f'data_products/processed/fig_{script_date}_{parent_file_name}' From 06c5c6d84b0545994b26f9dc6bd521571172b110 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Wed, 19 Jun 2024 18:30:02 +0200 Subject: [PATCH 3/6] Incorporated method to MetadataHarvester class to collect project level metadata. --- src/metadata_review_lib.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py index 17f47df..483e564 100644 --- a/src/metadata_review_lib.py +++ b/src/metadata_review_lib.py @@ -322,12 +322,16 @@ class MetadataHarvester: parent_files = [] self.parent_files = parent_files self.metadata = { + "project": {}, "sample": {}, "environment": {}, "instruments": {}, "datasets": {} } + def add_project_info(self, key_or_dict, value=None): + self._add_info("project", key_or_dict, value) + def add_sample_info(self, key_or_dict, value=None): self._add_info("sample", key_or_dict, value) @@ -365,6 +369,7 @@ class MetadataHarvester: def clear_metadata(self): self.metadata = { + "project": {}, "sample": {}, "environment": {}, "instruments": {}, From 498a51cbc696a8582c23141853c82d038ab6b256 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Wed, 19 Jun 2024 18:31:11 +0200 Subject: [PATCH 4/6] Updated function to add project level metadata at the root group of the hdf5 file. 
--- src/hdf5_lib.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index a7ab09b..92babfe 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -505,6 +505,10 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, # Get metadata metadata_dict = annotator.get_metadata() + # Prepare project level attributes + + root_level_attributes = metadata_dict['metadata']['project'] + # Prepare high-level attributes high_level_attributes = { 'parent_files': metadata_dict['parent_files'], @@ -538,6 +542,7 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, # Write to HDF5 with h5py.File(output_filename, 'w') as h5file: + h5file.attrs.update(root_level_attributes) transfer_file_dict_to_hdf5(h5file, '/', file_dict) From 106795ae598df67a69a892ffba45e1597ec7c199 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Thu, 20 Jun 2024 09:03:47 +0200 Subject: [PATCH 5/6] Added a few lines to detect the existence of the file and change the file mode from 'w' to 'a' based on that information. --- src/hdf5_lib.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 92babfe..8e58bf0 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -482,7 +482,7 @@ import os import src.g5505_utils as utils import h5py import src.metadata_review_lib as metadata_lib -def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, script_name): +def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name): """ Save processed dataframe columns with annotations to an HDF5 file. 
@@ -505,9 +505,9 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, # Get metadata metadata_dict = annotator.get_metadata() - # Prepare project level attributes + # Prepare project level attributes to be added at the root level - root_level_attributes = metadata_dict['metadata']['project'] + project_level_attributes = metadata_dict['metadata']['project'] # Prepare high-level attributes high_level_attributes = { @@ -524,13 +524,10 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, if isinstance(value,dict): data_level_attributes[key] = metadata_lib.parse_attribute(value) - # Generate output filename - parent_file_name = os.path.split(src_hdf5_path)[1] - output_filename = f'data_products/processed/fig_{script_date}_{parent_file_name}' # Prepare file dictionary file_dict = { - 'name': script_name, + 'name': project_level_attributes['script_name'], 'attributes_dict': high_level_attributes, 'datasets': [{ 'name': "data_table", @@ -540,9 +537,19 @@ def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, }] } + # Check if the file exists + if os.path.exists(output_filename): + mode = "a" + print(f"File {output_filename} exists. Opening in append mode.") + else: + mode = "w" + print(f"File {output_filename} does not exist. Creating a new file.") + + # Write to HDF5 - with h5py.File(output_filename, 'w') as h5file: - h5file.attrs.update(root_level_attributes) + with h5py.File(output_filename, mode) as h5file: + # Add project level attributes at the root/top level + h5file.attrs.update(project_level_attributes) transfer_file_dict_to_hdf5(h5file, '/', file_dict) From cedfe614e74759808a37a6d30f6c44019430a5c0 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Thu, 20 Jun 2024 15:32:33 +0200 Subject: [PATCH 6/6] Implemented input argument to enable appending information to existing attributes, which must take the values of either strings or lists. 
--- src/metadata_review_lib.py | 45 ++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py index 483e564..5744a55 100644 --- a/src/metadata_review_lib.py +++ b/src/metadata_review_lib.py @@ -329,27 +329,48 @@ class MetadataHarvester: "datasets": {} } - def add_project_info(self, key_or_dict, value=None): - self._add_info("project", key_or_dict, value) + def add_project_info(self, key_or_dict, value=None, append=False): + self._add_info("project", key_or_dict, value, append) - def add_sample_info(self, key_or_dict, value=None): - self._add_info("sample", key_or_dict, value) + def add_sample_info(self, key_or_dict, value=None, append=False): + self._add_info("sample", key_or_dict, value, append) - def add_environment_info(self, key_or_dict, value=None): - self._add_info("environment", key_or_dict, value) + def add_environment_info(self, key_or_dict, value=None, append=False): + self._add_info("environment", key_or_dict, value, append) - def add_instrument_info(self, key_or_dict, value=None): - self._add_info("instruments", key_or_dict, value) + def add_instrument_info(self, key_or_dict, value=None, append=False): + self._add_info("instruments", key_or_dict, value, append) - def add_dataset_info(self, key_or_dict, value=None): - self._add_info("datasets", key_or_dict, value) + def add_dataset_info(self, key_or_dict, value=None, append=False): + self._add_info("datasets", key_or_dict, value, append) - def _add_info(self, category, key_or_dict, value): + def _add_info(self, category, key_or_dict, value, append): """Internal helper method to add information to a category.""" if isinstance(key_or_dict, dict): self.metadata[category].update(key_or_dict) else: - self.metadata[category][key_or_dict] = value + if key_or_dict in self.metadata[category]: + if append: + current_value = self.metadata[category][key_or_dict] + + if isinstance(current_value, list): + + if not 
isinstance(value, list): + # Append the new value to the list + self.metadata[category][key_or_dict].append(value) + else: + self.metadata[category][key_or_dict] = current_value + value + + elif isinstance(current_value, str): + # Append the new value as a comma-separated string + self.metadata[category][key_or_dict] = current_value + ',' + str(value) + else: + # Handle other types (for completeness, usually not required) + self.metadata[category][key_or_dict] = [current_value, value] + else: + self.metadata[category][key_or_dict] = value + else: + self.metadata[category][key_or_dict] = value def get_metadata(self): return {