diff --git a/src/hdf5_writer.py b/src/hdf5_writer.py
index 175d288..960d447 100644
--- a/src/hdf5_writer.py
+++ b/src/hdf5_writer.py
@@ -100,6 +100,20 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
         print(message)
         logging.error(message)
     else:
+        # Step 1: Preprocess all metadata.json files into a lookup dict
+        all_metadata_dict = {}
+
+        for dirpath, filenames in path_to_filenames_dict.items():
+            metadata_file = next((f for f in filenames if f.endswith('metadata.json')), None)
+            if metadata_file:
+                metadata_path = os.path.join(dirpath, metadata_file)
+                try:
+                    with open(metadata_path, 'r') as metafile:
+                        all_metadata_dict[dirpath] = json.load(metafile)
+                except json.JSONDecodeError:
+                    logging.warning(f"Invalid JSON in metadata file: {metadata_path}")
+                    all_metadata_dict[dirpath] = {}
+
         with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
 
             number_of_dirs = len(path_to_filenames_dict.keys())
@@ -138,21 +152,14 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
                     stdout = inst
                     logging.error('Failed to create group %s into HDF5: %s', group_name, inst)
 
-                if 'data_lineage_metadata.json' in filtered_filenames_list:
-                    idx = filtered_filenames_list.index('data_lineage_metadata.json')
-                    data_lineage_file = filtered_filenames_list[idx]
-                    try:
-                        with open('/'.join([dirpath,data_lineage_file]),'r') as dlf:
-                            data_lineage_dict = json.load(dlf)
-                        filtered_filenames_list.pop(idx)
-                    except json.JSONDecodeError:
-                        data_lineage_dict = {} # Start fresh if file is invalid
-
-                else:
-                    data_lineage_dict = {}
-
+                # Step 3: During ingestion, attach metadata per file
+                metadata_dict = all_metadata_dict.get(dirpath, {})
 
                 for filenumber, filename in enumerate(filtered_filenames_list):
+
+                    # Skip any file that itself ends in metadata.json
+                    if filename.endswith('metadata.json'):
+                        continue
 
                     # hdf5 path to filename group
                     dest_group_name = f'{group_name}/{filename}'
@@ -163,6 +170,10 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
                     #file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
                     file_dict = filereader_registry.select_file_reader(dest_group_name)(source_file_path)
 
+                    # Attach per-file metadata if available
+                    if filename in metadata_dict:
+                        file_dict.setdefault("attributes_dict", {}).update(metadata_dict[filename])
+
                     stdout = hdf5_ops.save_file_dict_to_hdf5(dest_file_obj, group_name, file_dict)
 
                 else:
@@ -270,6 +281,21 @@ def create_hdf5_file_from_filesystem_path_new(path_to_input_directory: str,
         print(message)
         logging.error(message)
     else:
+
+        # Step 1: Preprocess all metadata.json files into a lookup dict
+        all_metadata_dict = {}
+
+        for dirpath, filenames in path_to_filenames_dict.items():
+            metadata_file = next((f for f in filenames if f.endswith('metadata.json')), None)
+            if metadata_file:
+                metadata_path = os.path.join(dirpath, metadata_file)
+                try:
+                    with open(metadata_path, 'r') as metafile:
+                        all_metadata_dict[dirpath] = json.load(metafile)
+                except json.JSONDecodeError:
+                    logging.warning(f"Invalid JSON in metadata file: {metadata_path}")
+                    all_metadata_dict[dirpath] = {}
+
         with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
 
             print('Created file')
@@ -309,7 +335,14 @@ def create_hdf5_file_from_filesystem_path_new(path_to_input_directory: str,
            # stdout = inst
            # logging.error('Failed to create group %s into HDF5: %s', group_name, inst)
 
+            # Step 3: During ingestion, attach metadata per file
+            # TODO: pass this metadata dict to run_file_reader line 363
+            metadata_dict = all_metadata_dict.get(dirpath, {})
+
             for filenumber, filename in enumerate(filtered_filenames_list):
+
+                if filename.endswith('metadata.json'):
+                    continue
 
                 #file_ext = os.path.splitext(filename)[1]
                 #try: