Reimplemented file filtering: file extension constraints are imposed first, followed by file keyword constraints.
This commit is contained in:
@ -268,24 +268,23 @@ def create_hdf5_file_from_filesystem_path(config_param : dict ,
|
|||||||
raise ValueError('input_file_system_path needs to be specified using forward slashes "/".' )
|
raise ValueError('input_file_system_path needs to be specified using forward slashes "/".' )
|
||||||
|
|
||||||
for i, keyword in enumerate(select_dir_keywords):
|
for i, keyword in enumerate(select_dir_keywords):
|
||||||
select_dir_keywords[i] = keyword.replace('/',os.sep)
|
select_dir_keywords[i] = keyword.replace('/',os.sep)
|
||||||
|
|
||||||
|
# Visit each subdirectory from top to bottom, root directory defined by input_file_system_path to the lower
|
||||||
|
# level directories.
|
||||||
|
|
||||||
with h5py.File(output_filename, 'w') as h5file:
|
# Constrain walkable paths on the specified directory tree by allowing walks that start from root
|
||||||
|
# through subdirectories specified by dir_keywords. This improves efficiency, especially in deep
|
||||||
|
# directory trees with many leaves.
|
||||||
|
paths = []
|
||||||
|
if top_sub_dir_mask:
|
||||||
|
for item in os.listdir(input_file_system_path):
|
||||||
|
if any([item in keyword for keyword in select_dir_keywords]):
|
||||||
|
paths.append(os.path.join(input_file_system_path,item))
|
||||||
|
else:
|
||||||
|
paths.append(input_file_system_path)
|
||||||
|
|
||||||
# Visit each subdirectory from top to bottom, root directory defined by input_file_system_path to the lower
|
with h5py.File(output_filename, 'w') as h5file:
|
||||||
# level directories.
|
|
||||||
|
|
||||||
# Constrain walkable paths on the specified directory tree by allowing walks that start from root
|
|
||||||
# through subdirectories specified by dir_keywords. This improves efficiency, especially in deep
|
|
||||||
# directory trees with many leaves.
|
|
||||||
paths = []
|
|
||||||
if top_sub_dir_mask:
|
|
||||||
for item in os.listdir(input_file_system_path):
|
|
||||||
if any([item in keyword for keyword in select_dir_keywords]):
|
|
||||||
paths.append(os.path.join(input_file_system_path,item))
|
|
||||||
else:
|
|
||||||
paths.append(input_file_system_path)
|
|
||||||
|
|
||||||
for item in paths:
|
for item in paths:
|
||||||
|
|
||||||
@ -294,23 +293,34 @@ def create_hdf5_file_from_filesystem_path(config_param : dict ,
|
|||||||
# Create dictionary with directory-files pairs where files satisfy keyword and admissible type constraints
|
# Create dictionary with directory-files pairs where files satisfy keyword and admissible type constraints
|
||||||
# It requires an extra pass over directory tree and additional memory for dictionary, but it may be useful
|
# It requires an extra pass over directory tree and additional memory for dictionary, but it may be useful
|
||||||
# to speed up subsequent step and prune resulting directory tree.
|
# to speed up subsequent step and prune resulting directory tree.
|
||||||
file_paths_dict = {}
|
|
||||||
if select_file_keywords:
|
|
||||||
for dirpath, _, filenames_list in os.walk(item,topdown=False):
|
|
||||||
file_paths_dict[dirpath] = []
|
|
||||||
for filename in filenames_list:
|
|
||||||
|
|
||||||
if not any([ext in filename for ext in admissible_file_ext_list]):
|
# For each directory and/or subdirectory, keep files that satisfy file_keyword constraints, and store
|
||||||
continue
|
# (directory_path, suitable files) relationships in a dictionary.
|
||||||
|
file_paths_dict = {}
|
||||||
|
|
||||||
|
check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
|
||||||
|
|
||||||
|
for dirpath, _, filenames in os.walk(item,topdown=False):
|
||||||
|
file_paths_dict[dirpath] = []
|
||||||
|
|
||||||
|
# Check files that have an admissible extension and store them in admissible_filenames list
|
||||||
|
admissible_filenames = []
|
||||||
|
for fn in filenames:
|
||||||
|
if check_file_ext(fn):
|
||||||
|
admissible_filenames.append(fn)
|
||||||
|
|
||||||
|
if select_file_keywords: # when select_file_keywords = [], all files are considered
|
||||||
|
for filename in admissible_filenames:
|
||||||
|
# Do not consider files with types for which there is still no file_reader. TODO: extend file_reader library.
|
||||||
|
#if not any([ext in filename for ext in admissible_file_ext_list]):
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# Add files whose name contains any of the file_keywords
|
||||||
if any([keyword in filename for keyword in select_file_keywords]):
|
if any([keyword in filename for keyword in select_file_keywords]):
|
||||||
file_paths_dict[dirpath].append(filename)
|
file_paths_dict[dirpath].append(filename)
|
||||||
|
else:
|
||||||
#admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys())
|
file_paths_dict[dirpath] = admissible_filenames
|
||||||
#for filename in filtered_filename_list.copy():
|
|
||||||
# if not any([ext in filename for ext in admissible_file_ext_list]):
|
|
||||||
# filtered_filename_list.remove(filename)
|
|
||||||
|
|
||||||
for node_number, node in enumerate(os.walk(item, topdown=True)):
|
for node_number, node in enumerate(os.walk(item, topdown=True)):
|
||||||
|
|
||||||
dirpath, dirnames, filenames_list = node
|
dirpath, dirnames, filenames_list = node
|
||||||
|
Reference in New Issue
Block a user