From b24d33ab15bc481ac6e6ad8bbfe5baa218b99486 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Sun, 24 Nov 2024 10:38:13 +0100 Subject: [PATCH] Check whether h5 file being written exists. If so, we do not overwrite it because it may be undergoing refinement, changes or updates, for archiving, sharing, or publishing. --- src/hdf5_writer.py | 142 ++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 67 deletions(-) diff --git a/src/hdf5_writer.py b/src/hdf5_writer.py index 86326b9..5c0aba8 100644 --- a/src/hdf5_writer.py +++ b/src/hdf5_writer.py @@ -162,79 +162,87 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str, print(start_message) logging.info(start_message) - - with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file: + # Check if the .h5 file already exists + if os.path.exists(path_to_output_file) and mode in ['w']: + message = ( + f"[Notice] The file '{path_to_output_file}' already exists and will not be overwritten.\n" + "If you wish to replace it, please delete the existing file first and rerun the program." + ) + print(message) + logging.error(message) + else: + with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file: - number_of_dirs = len(path_to_filenames_dict.keys()) - dir_number = 1 - for dirpath, filtered_filenames_list in path_to_filenames_dict.items(): - - # Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict. 
+ if not filtered_filenames_list: + continue - group_name = dirpath.replace(os.sep,'/') - group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/') + group_name = dirpath.replace(os.sep,'/') + group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/') - # Flatten group name to one level - if select_dir_keywords: - offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords]) - else: - offset = 1 - tmp_list = group_name.split('/') - if len(tmp_list) > offset+1: - group_name = '/'.join([tmp_list[i] for i in range(offset+1)]) - - # Create group called "group_name". Hierarchy of nested groups can be implicitly defined by the forward slashes - if not group_name in h5file.keys(): - h5file.create_group(group_name) - h5file[group_name].attrs['creation_date'] = utils.created_at().encode('utf-8') - #h5file[group_name].attrs.create(name='filtered_file_list',data=convert_string_to_bytes(filtered_filename_list)) - #h5file[group_name].attrs.create(name='file_list',data=convert_string_to_bytes(filenames_list)) - #else: - #print(group_name,' was already created.') - instFoldermsgStart = f'Starting data transfer from instFolder: {group_name}' - print(instFoldermsgStart) - - for filenumber, filename in enumerate(filtered_filenames_list): - - #file_ext = os.path.splitext(filename)[1] - #try: - - # hdf5 path to filename group - dest_group_name = f'{group_name}/{filename}' - - if not 'h5' in filename: - #file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename)) - #file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename)) - file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath,filename)) - - stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict) - + # Flatten group name to one level + if select_dir_keywords: + offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords]) else: - source_file_path = 
os.path.join(dirpath,filename) - dest_file_obj = h5file - #group_name +'/'+filename - #ext_to_reader_dict[file_ext](source_file_path, dest_file_obj, dest_group_name) - #g5505f_reader.select_file_reader(dest_group_name)(source_file_path, dest_file_obj, dest_group_name) - stdout = __copy_file_in_group(source_file_path, dest_file_obj, dest_group_name, False) + offset = 1 + tmp_list = group_name.split('/') + if len(tmp_list) > offset+1: + group_name = '/'.join([tmp_list[i] for i in range(offset+1)]) - # Update the progress bar and log the end message - instFoldermsdEnd = f'\nCompleted data transfer for instFolder: {group_name}\n' - # Print and log the start message - utils.progressBar(dir_number, number_of_dirs, instFoldermsdEnd) - logging.info(instFoldermsdEnd ) - dir_number = dir_number + 1 + # Create group called "group_name". Hierarchy of nested groups can be implicitly defined by the forward slashes + if not group_name in h5file.keys(): + h5file.create_group(group_name) + h5file[group_name].attrs['creation_date'] = utils.created_at().encode('utf-8') + #h5file[group_name].attrs.create(name='filtered_file_list',data=convert_string_to_bytes(filtered_filename_list)) + #h5file[group_name].attrs.create(name='file_list',data=convert_string_to_bytes(filenames_list)) + #else: + #print(group_name,' was already created.') + instFoldermsgStart = f'Starting data transfer from instFolder: {group_name}' + print(instFoldermsgStart) - print('[End] Data integration') - logging.info('[End] Data integration') - - if len(root_metadata_dict.keys())>0: - for key, value in root_metadata_dict.items(): - #if key in h5file.attrs: - # del h5file.attrs[key] - h5file.attrs.create(key, value) - #annotate_root_dir(output_filename,root_metadata_dict) + for filenumber, filename in enumerate(filtered_filenames_list): + + #file_ext = os.path.splitext(filename)[1] + #try: + + # hdf5 path to filename group + dest_group_name = f'{group_name}/{filename}' + + if not 'h5' in filename: + #file_dict = 
config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename)) + #file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename)) + file_dict = filereader_registry.select_file_reader(dest_group_name)(os.path.join(dirpath,filename)) + + stdout = __transfer_file_dict_to_hdf5(h5file, group_name, file_dict) + + else: + source_file_path = os.path.join(dirpath,filename) + dest_file_obj = h5file + #group_name +'/'+filename + #ext_to_reader_dict[file_ext](source_file_path, dest_file_obj, dest_group_name) + #g5505f_reader.select_file_reader(dest_group_name)(source_file_path, dest_file_obj, dest_group_name) + stdout = __copy_file_in_group(source_file_path, dest_file_obj, dest_group_name, False) + + # Update the progress bar and log the end message + instFoldermsdEnd = f'\nCompleted data transfer for instFolder: {group_name}\n' + # Print and log the start message + utils.progressBar(dir_number, number_of_dirs, instFoldermsdEnd) + logging.info(instFoldermsdEnd ) + dir_number = dir_number + 1 + + print('[End] Data integration') + logging.info('[End] Data integration') + + if len(root_metadata_dict.keys())>0: + for key, value in root_metadata_dict.items(): + #if key in h5file.attrs: + # del h5file.attrs[key] + h5file.attrs.create(key, value) + #annotate_root_dir(output_filename,root_metadata_dict) #output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename)