Major update. Remove the file-filtering option and the output_filename input argument. The output name is now path_to_input_directory + '.h5'. By default, the HDF5 writer preserves second-level subdirectories, and the rest are flattened. Directory filtering is outsourced to copy_directory_with_contraints from utils.

This commit is contained in:
2024-09-16 16:35:09 +02:00
parent 9c641c0dae
commit d63f522588

View File

@ -238,11 +238,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
if 'tmp_files' in tmp_file_path:
os.remove(tmp_file_path)
def create_hdf5_file_from_filesystem_path(output_filename : str,
path_to_input_directory: str,
select_dir_keywords = [],
select_file_keywords =[],
top_sub_dir_mask : bool = True,
def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
path_to_filenames_dict: dict = None,
select_dir_keywords : list = [],
root_metadata_dict : dict = {}):
"""
@ -257,17 +255,17 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
----------
output_filename : str
Name of the output HDF5 file.
path_to_input_directory: str
path_to_input_directory : str
Path to root directory, specified with forward slashes, e.g., path/to/root.
path_to_filenames_dict : dict, optional
A pre-processed dictionary where keys are directory paths on the input directory's tree and values are lists of files.
If provided, 'input_file_system_path' is ignored.
select_dir_keywords : list
List of string elements to consider or select only directory paths that contain
a word in 'select_dir_keywords'. When empty, all directory paths are considered
to be included in the HDF5 file group hierarchy.
select_file_keywords : list
List of string elements to consider or select only files that contain a word in
'select_file_keywords'. When empty, all files are considered to be stored in the HDF5 file.
top_sub_dir_mask : bool
Mask for top-level subdirectories.
root_metadata_dict : dict
Metadata to include at the root level of the HDF5 file.
@ -277,7 +275,9 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
Path to the created HDF5 file.
"""
allowed_file_extensions = filereader_registry.file_extensions
output_filename = path_to_input_directory + '.h5'
if '/' in path_to_input_directory:
path_to_input_directory= path_to_input_directory.replace('/',os.sep)
else:
@ -287,12 +287,12 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
select_dir_keywords[i] = keyword.replace('/',os.sep)
# Copy input_directory into the output_dir_path, and work with it from now on
output_dir_path = os.path.splitext(output_filename)[0].replace('/',os.sep)
path_to_filenames_dict = utils.copy_directory_with_contraints(path_to_input_directory,
output_dir_path,
select_dir_keywords,
select_file_keywords,
allowed_file_extensions)
output_dir_path = os.path.splitext(output_filename)[0].replace('/',os.sep)
if not path_to_filenames_dict:
path_to_filenames_dict = utils.copy_directory_with_contraints(path_to_input_directory,
output_dir_path,
dry_run=True)
# Set input_directory as copied input directory
root_dir = output_dir_path
@ -307,10 +307,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
# Print and log the start message
print(start_message)
logging.info(start_message)
# Check if dirpath is valid. TODO: This is perhaps redundant by design of path_to_filenames_dict.
if not is_valid_directory_path(dirpath,select_dir_keywords):
continue
# Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
if not filtered_filenames_list:
continue
@ -319,7 +316,10 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/')
# Flatten group name to one level
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
if select_dir_keywords:
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
else:
offset = 1
tmp_list = group_name.split('/')
if len(tmp_list) > offset+1:
group_name = '/'.join([tmp_list[i] for i in range(offset+1)])