Made a few optimizations to code and documentation. Expressions relying on list comprehensions were simplified with generator expressions, e.g., any([keyword in filename for keyword in select_file_keywords]) was simplified to any(keyword in filename for keyword in select_file_keywords).
This commit is contained in:
@ -225,14 +225,6 @@ def annotate_root_dir(filename,annotation_dict: dict):
|
|||||||
file.attrs.create('metadata_'+key, annotation_dict[key])
|
file.attrs.create('metadata_'+key, annotation_dict[key])
|
||||||
|
|
||||||
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
#def create_hdf5_file_from_filesystem_path(config_param : dict ,
|
|
||||||
# input_file_system_path : str,
|
|
||||||
# select_dir_keywords = [],
|
|
||||||
# select_file_keywords =[],
|
|
||||||
# top_sub_dir_mask : bool = True):
|
|
||||||
|
|
||||||
def is_valid_directory_path(dirpath,select_dir_keywords):
|
def is_valid_directory_path(dirpath,select_dir_keywords):
|
||||||
|
|
||||||
activated_keywords = []
|
activated_keywords = []
|
||||||
@ -256,9 +248,12 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) of a given filesystem path.
|
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) of a given filesystem path.
|
||||||
When file and directory keywords are non-empty, the keywords enable filtering of directories and files that do not contain the specified keywords.
|
When file and directory keywords are non-empty, the keywords enable filtering of directory paths and file paths that do not contain the specified keywords.
|
||||||
|
|
||||||
In the .h5 file, only files that are admissible file formats will be stored in the form of datasets and attributes.
|
The data integration capabilities are limited by our file reader, which can only access data from a list of admissible file formats.
|
||||||
|
These however can be extended.
|
||||||
|
Directories are groups in the resulting hdf5 file.
|
||||||
|
Files are formatted as composite objects consisting of a group, file, and attributes.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
|
|
||||||
@ -317,26 +312,28 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
# (directory_path, suitable files) relationships in a dictionary.
|
# (directory_path, suitable files) relationships in a dictionary.
|
||||||
file_paths_dict = {}
|
file_paths_dict = {}
|
||||||
|
|
||||||
check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
|
#check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
|
||||||
|
check_file_ext = lambda filename: os.path.splitext(filename)[1] in admissible_file_ext_list
|
||||||
|
|
||||||
for dirpath, _, filenames in os.walk(item,topdown=False):
|
for dirpath, _, filenames in os.walk(item,topdown=False):
|
||||||
file_paths_dict[dirpath] = []
|
file_paths_dict[dirpath] = []
|
||||||
|
|
||||||
# Check files that have an admissible extension and store them in admissible_filenames list
|
# Keep files that have an admissible extension and store them in admissible_filenames list
|
||||||
admissible_filenames = []
|
admissible_filenames = []
|
||||||
for fn in filenames:
|
for fn in filenames:
|
||||||
if check_file_ext(fn):
|
if check_file_ext(fn):
|
||||||
admissible_filenames.append(fn)
|
admissible_filenames.append(fn)
|
||||||
|
|
||||||
if select_file_keywords: # when select_file_keywords = [], all files are considered
|
if select_file_keywords: # when select_file_keywords = [], all files are considered
|
||||||
for filename in admissible_filenames:
|
#for filename in admissible_filenames:
|
||||||
# Do not consider files with types for which there is still no file_reader. TODO: extend file_reader library.
|
for i in range(len(admissible_filenames) - 1, -1, -1):
|
||||||
#if not any([ext in filename for ext in admissible_file_ext_list]):
|
filename = admissible_filenames[i]
|
||||||
# continue
|
|
||||||
|
|
||||||
# Add files with name, that contains any of the file_keywords
|
# Remove files whose filename does not adhere to the file keyword constraints.
|
||||||
if any([keyword in filename for keyword in select_file_keywords]):
|
if not any(keyword in filename for keyword in select_file_keywords):
|
||||||
file_paths_dict[dirpath].append(filename)
|
admissible_filenames.pop(i)
|
||||||
|
|
||||||
|
file_paths_dict[dirpath] = admissible_filenames
|
||||||
else:
|
else:
|
||||||
file_paths_dict[dirpath] = admissible_filenames
|
file_paths_dict[dirpath] = admissible_filenames
|
||||||
|
|
||||||
@ -344,21 +341,6 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
|
|
||||||
dirpath, dirnames, filenames_list = node
|
dirpath, dirnames, filenames_list = node
|
||||||
|
|
||||||
#if node_number == 0:
|
|
||||||
# offset = dirpath.count(os.sep)
|
|
||||||
|
|
||||||
# Filter out files with filenames not containing a keyword specified in the parameter 'select_file_keywords'.
|
|
||||||
# When select_file_keywords is an empty, i.e., [], do not apply any filter on the filenames.
|
|
||||||
|
|
||||||
|
|
||||||
#filtered_filename_list = []
|
|
||||||
#if select_file_keywords:
|
|
||||||
# for filename in filenames_list:
|
|
||||||
# if any([keyword in filename for keyword in select_file_keywords]):
|
|
||||||
# filtered_filename_list.append(filename)
|
|
||||||
#else:
|
|
||||||
# filtered_filename_list = filenames_list.copy()
|
|
||||||
|
|
||||||
filtered_filename_list = file_paths_dict.get(dirpath,filenames_list.copy())
|
filtered_filename_list = file_paths_dict.get(dirpath,filenames_list.copy())
|
||||||
|
|
||||||
|
|
||||||
@ -389,9 +371,6 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
print(group_name,' was already created.')
|
print(group_name,' was already created.')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)
|
|
||||||
|
|
||||||
for filenumber, filename in enumerate(filtered_filename_list):
|
for filenumber, filename in enumerate(filtered_filename_list):
|
||||||
|
|
||||||
# Get file extension (or file type)
|
# Get file extension (or file type)
|
||||||
|
Reference in New Issue
Block a user