From 68a9928c391af0f250fe96d590f979f953ee20d0 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Mon, 10 Feb 2025 15:52:17 +0100 Subject: [PATCH 1/4] Enable boolean type columns from pandas DataFrame to be suitably converted into numpy structured array --- utils/g5505_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/g5505_utils.py b/utils/g5505_utils.py index a145df1..b413271 100644 --- a/utils/g5505_utils.py +++ b/utils/g5505_utils.py @@ -161,6 +161,8 @@ def convert_dataframe_to_np_structured_array(df: pd.DataFrame): dtype.append((col, 'i4')) # Assuming 32-bit integer elif pd.api.types.is_float_dtype(col_dtype): dtype.append((col, 'f4')) # Assuming 32-bit float + elif pd.api.types.is_bool_dtype(col_dtype): + dtype.append((col,bool)) else: # Handle unsupported data types print(f"Unsupported dtype found in column '{col}': {col_data.dtype}") From 8ce6f588dc4889f32d0d60375c4e2cef971b1843 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Mon, 10 Feb 2025 15:56:34 +0100 Subject: [PATCH 2/4] Implement data_lineage_metadata.json detection and then use it to annotate associated file. 
--- src/hdf5_writer.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/hdf5_writer.py b/src/hdf5_writer.py index 3006b7e..0cb3602 100644 --- a/src/hdf5_writer.py +++ b/src/hdf5_writer.py @@ -7,6 +7,7 @@ import pandas as pd import numpy as np import h5py import logging +import json import utils.g5505_utils as utils import instruments.readers.filereader_registry as filereader_registry @@ -209,11 +210,22 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str, stdout = inst logging.error('Failed to create group %s into HDF5: %s', group_name, inst) + if 'data_lineage_metadata.json' in filtered_filenames_list: + idx = filtered_filenames_list.index('data_lineage_metadata.json') + data_lineage_file = filtered_filenames_list[idx] + try: + with open('/'.join([dirpath,data_lineage_file]),'r') as dlf: + data_lineage_dict = json.load(dlf) + filtered_filenames_list.pop(idx) + except json.JSONDecodeError: + data_lineage_dict = {} # Start fresh if file is invalid + + else: + data_lineage_dict = {} + + for filenumber, filename in enumerate(filtered_filenames_list): - #file_ext = os.path.splitext(filename)[1] - #try: - # hdf5 path to filename group dest_group_name = f'{group_name}/{filename}' @@ -221,6 +233,10 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str, #file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename)) #file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename)) file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath,filename)) + # Check whether there is an available file reader + if file_dict is not None and isinstance(file_dict, dict): + if 'attributes_dict' in file_dict: + file_dict['attributes_dict'].update(data_lineage_dict.get(filename,{})) stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict) From 6ebc699a433eb8ca3cab6abadf3edcec8a252c7b Mon Sep 17 00:00:00 2001 From: Florez 
Ospina Juan Felipe Date: Sat, 22 Feb 2025 17:53:19 +0100 Subject: [PATCH 3/4] Moved filereader_registry.py outside readers folder. --- instruments/{readers => }/filereader_registry.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename instruments/{readers => }/filereader_registry.py (100%) diff --git a/instruments/readers/filereader_registry.py b/instruments/filereader_registry.py similarity index 100% rename from instruments/readers/filereader_registry.py rename to instruments/filereader_registry.py From 1e67745fa4697c4cd64e97a9e8c4368ad11827f2 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Sat, 22 Feb 2025 17:59:00 +0100 Subject: [PATCH 4/4] Fix import for filereader_registry.py after moving it from instruments/readers/ one level up. --- pipelines/data_integration.py | 2 +- src/hdf5_writer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/data_integration.py b/pipelines/data_integration.py index cd453c0..bb3a37d 100644 --- a/pipelines/data_integration.py +++ b/pipelines/data_integration.py @@ -24,7 +24,7 @@ from itertools import chain # Import DIMA modules import src.hdf5_writer as hdf5_lib import utils.g5505_utils as utils -from instruments.readers import filereader_registry +from instruments import filereader_registry allowed_file_extensions = filereader_registry.file_extensions diff --git a/src/hdf5_writer.py b/src/hdf5_writer.py index 3006b7e..ed3a8c5 100644 --- a/src/hdf5_writer.py +++ b/src/hdf5_writer.py @@ -9,7 +9,7 @@ import h5py import logging import utils.g5505_utils as utils -import instruments.readers.filereader_registry as filereader_registry +import instruments.filereader_registry as filereader_registry