Made a few optimizations to the code and documentation. Expressions relying on list comprehensions were simplified to generator expressions, e.g., any([keyword in filename for keyword in select_file_keywords]) was simplified to any(keyword in filename for keyword in select_file_keywords).

This commit is contained in:
2024-05-24 09:06:07 +02:00
parent d574ac382d
commit 1537633b1a

View File

@ -225,14 +225,6 @@ def annotate_root_dir(filename,annotation_dict: dict):
file.attrs.create('metadata_'+key, annotation_dict[key])
import shutil
#def create_hdf5_file_from_filesystem_path(config_param : dict ,
# input_file_system_path : str,
# select_dir_keywords = [],
# select_file_keywords =[],
# top_sub_dir_mask : bool = True):
def is_valid_directory_path(dirpath,select_dir_keywords):
activated_keywords = []
@ -256,9 +248,12 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
"""
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) of a given filesystem path.
When file and directory keywords are non-empty, the keywords enable filtering of directories and files that do not contain the specified keywords.
When file and directory keywords are non-empty, the keywords enable filtering of directory paths and file paths that do not contain the specified keywords.
In the .h5 file, only files that are admissible file formats will be stored in the form of datasets and attributes.
The data integration capabilities are limited by our file reader, which can only access data from a list of admissible file formats.
These however can be extended.
Directories are groups in the resulting hdf5 file.
Files are formatted as a composite object consisting of a group, file, and attributes.
Parameters:
@ -317,26 +312,28 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
# (directory_path, suitable files) relationships in a dictionary.
file_paths_dict = {}
check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
#check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
check_file_ext = lambda filename: os.path.splitext(filename)[1] in admissible_file_ext_list
for dirpath, _, filenames in os.walk(item,topdown=False):
file_paths_dict[dirpath] = []
# Check files that have an admissible extension and store them in admissible_filenames list
# Keep files that have an admissible extension and store them in admissible_filenames list
admissible_filenames = []
for fn in filenames:
if check_file_ext(fn):
admissible_filenames.append(fn)
if select_file_keywords: # when select_file_keywords = [], all files are considered
for filename in admissible_filenames:
# Do not consider files with types for which there is still no file_reader. TODO: extend file_reader library.
#if not any([ext in filename for ext in admissible_file_ext_list]):
# continue
#for filename in admissible_filenames:
for i in range(len(admissible_filenames) - 1, -1, -1):
filename = admissible_filenames[i]
# Add files with name, that contains any of the file_keywords
if any([keyword in filename for keyword in select_file_keywords]):
file_paths_dict[dirpath].append(filename)
# Remove files whose filenames do not satisfy the file keyword constraints.
if not any(keyword in filename for keyword in select_file_keywords):
admissible_filenames.pop(i)
file_paths_dict[dirpath] = admissible_filenames
else:
file_paths_dict[dirpath] = admissible_filenames
@ -344,21 +341,6 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
dirpath, dirnames, filenames_list = node
#if node_number == 0:
# offset = dirpath.count(os.sep)
# Filter out files with filenames not containing a keyword specified in the parameter 'select_file_keywords'.
# When select_file_keywords is an empty, i.e., [], do not apply any filter on the filenames.
#filtered_filename_list = []
#if select_file_keywords:
# for filename in filenames_list:
# if any([keyword in filename for keyword in select_file_keywords]):
# filtered_filename_list.append(filename)
#else:
# filtered_filename_list = filenames_list.copy()
filtered_filename_list = file_paths_dict.get(dirpath,filenames_list.copy())
@ -389,9 +371,6 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
print(group_name,' was already created.')
# TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)
for filenumber, filename in enumerate(filtered_filename_list):
# Get file extension (or file type)