diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index 79ef5e0..386d5a4 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -268,24 +268,23 @@ def create_hdf5_file_from_filesystem_path(config_param : dict , raise ValueError('input_file_system_path needs to be specified using forward slashes "/".' ) for i, keyword in enumerate(select_dir_keywords): - select_dir_keywords[i] = keyword.replace('/',os.sep) + select_dir_keywords[i] = keyword.replace('/',os.sep) + # Visit each subdirectory from top to bottom, root directory defined by input_file_sytem_path to the lower + # level directories. - with h5py.File(output_filename, 'w') as h5file: + # Constrain walkable paths on the specified directory tree by allowing walks that start from root + # through subdirectories specified by dir_keywords. This improves efficiency especially, in deep + # directory trees with many leaves. + paths = [] + if top_sub_dir_mask: + for item in os.listdir(input_file_system_path): + if any([item in keyword for keyword in select_dir_keywords]): + paths.append(os.path.join(input_file_system_path,item)) + else: + paths.append(input_file_system_path) - # Visit each subdirectory from top to bottom, root directory defined by input_file_sytem_path to the lower - # level directories. - - # Constrain walkable paths on the specified directory tree by allowing walks that start from root - # through subdirectories specified by dir_keywords. This improves efficiency especially, in deep - # directory trees with many leaves. - paths = [] - if top_sub_dir_mask: - for item in os.listdir(input_file_system_path): - if any([item in keyword for keyword in select_dir_keywords]): - paths.append(os.path.join(input_file_system_path,item)) - else: - paths.append(input_file_system_path) + with h5py.File(output_filename, 'w') as h5file: for item in paths: @@ -294,23 +293,34 @@ def create_hdf5_file_from_filesystem_path(config_param : dict , # Create dictionary with directory-files pairs where files satisfy keyword and admisible type contraints # It requires an extra pass over directory three and additional memory for dictionary, but it may be useful # to speed up subsequent step and prune resulting directory tree. - file_paths_dict = {} - if select_file_keywords: - for dirpath, _, filenames_list in os.walk(item,topdown=False): - file_paths_dict[dirpath] = [] - for filename in filenames_list: - if not any([ext in filename for ext in admissible_file_ext_list]): - continue + # For each directory and/or subdirectory, keep files that satisfy file_keyword constraints, and store + # (directory_path, suitable files) relationships in a dictionary. + file_paths_dict = {} + check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list]) + + for dirpath, _, filenames in os.walk(item,topdown=False): + file_paths_dict[dirpath] = [] + + # Check files that have an admissible extension and store them in admissible_filenames list + admissible_filenames = [] + for fn in filenames: + if check_file_ext(fn): + admissible_filenames.append(fn) + + if select_file_keywords: # when select_file_keywords = [], all files are considered + for filename in admissible_filenames: + # Do not consider files with types for which there is still no file_reader. TODO: extend file_reader library. + #if not any([ext in filename for ext in admissible_file_ext_list]): + # continue + + # Add files with name, that contains any of the file_keywords if any([keyword in filename for keyword in select_file_keywords]): file_paths_dict[dirpath].append(filename) - - #admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys()) - #for filename in filtered_filename_list.copy(): - # if not any([ext in filename for ext in admissible_file_ext_list]): - # filtered_filename_list.remove(filename) - + else: + file_paths_dict[dirpath] = admissible_filenames + for node_number, node in enumerate(os.walk(item, topdown=True)): dirpath, dirnames, filenames_list = node