diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index c623ee9..8c90232 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -238,11 +238,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n if 'tmp_files' in tmp_file_path: os.remove(tmp_file_path) -def create_hdf5_file_from_filesystem_path(output_filename : str, - path_to_input_directory: str, - select_dir_keywords = [], - select_file_keywords =[], - top_sub_dir_mask : bool = True, +def create_hdf5_file_from_filesystem_path(path_to_input_directory: str, + path_to_filenames_dict: dict = None, + select_dir_keywords : list = [], root_metadata_dict : dict = {}): """ @@ -257,17 +255,17 @@ def create_hdf5_file_from_filesystem_path(output_filename : str, ---------- output_filename : str Name of the output HDF5 file. - path_to_input_directory: str + path_to_input_directory : str Path to root directory, specified with forward slashes, e.g., path/to/root. + + path_to_filenames_dict : dict, optional + A pre-processed dictionary where keys are directory paths on the input directory's tree and values are lists of files. + If provided (non-empty), it is used as-is instead of being built from 'path_to_input_directory'. + select_dir_keywords : list List of string elements to consider or select only directory paths that contain a word in 'select_dir_keywords'. When empty, all directory paths are considered to be included in the HDF5 file group hierarchy. - select_file_keywords : list - List of string elements to consider or select only files that contain a word in - 'select_file_keywords'. When empty, all files are considered to be stored in the HDF5 file. - top_sub_dir_mask : bool - Mask for top-level subdirectories. root_metadata_dict : dict Metadata to include at the root level of the HDF5 file. @@ -277,7 +275,9 @@ def create_hdf5_file_from_filesystem_path(output_filename : str, Path to the created HDF5 file. 
""" - allowed_file_extensions = filereader_registry.file_extensions + output_filename = path_to_input_directory + '.h5' + + if '/' in path_to_input_directory: path_to_input_directory= path_to_input_directory.replace('/',os.sep) else: @@ -287,12 +287,12 @@ def create_hdf5_file_from_filesystem_path(output_filename : str, select_dir_keywords[i] = keyword.replace('/',os.sep) # Copy input_directory into the output_dir_path, and work with it from now on - output_dir_path = os.path.splitext(output_filename)[0].replace('/',os.sep) - path_to_filenames_dict = utils.copy_directory_with_contraints(path_to_input_directory, - output_dir_path, - select_dir_keywords, - select_file_keywords, - allowed_file_extensions) + output_dir_path = os.path.splitext(output_filename)[0].replace('/',os.sep) + + if not path_to_filenames_dict: + path_to_filenames_dict = utils.copy_directory_with_contraints(path_to_input_directory, + output_dir_path, + dry_run=True) # Set input_directory as copied input directory root_dir = output_dir_path @@ -307,10 +307,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str, # Print and log the start message print(start_message) logging.info(start_message) - - # Check if dirpath is valid. TODO: This is perhaps redundant by design of path_to_filenames_dict. - if not is_valid_directory_path(dirpath,select_dir_keywords): - continue + # Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict. 
if not filtered_filenames_list: continue @@ -319,7 +316,10 @@ def create_hdf5_file_from_filesystem_path(output_filename : str, group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/') # Flatten group name to one level - offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords]) + if select_dir_keywords: + offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords]) + else: + offset = 1 tmp_list = group_name.split('/') if len(tmp_list) > offset+1: group_name = '/'.join([tmp_list[i] for i in range(offset+1)])