Major update. Remove the file-filtering option and the output_filename input argument. The output file name is now path_to_input_directory + '.h5'. By default, the HDF5 writer preserves second-level subdirectories and the rest are flattened. Directory filtering is outsourced to utils.copy_directory_with_contraints.
This commit is contained in:
@ -238,11 +238,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
|
|||||||
if 'tmp_files' in tmp_file_path:
|
if 'tmp_files' in tmp_file_path:
|
||||||
os.remove(tmp_file_path)
|
os.remove(tmp_file_path)
|
||||||
|
|
||||||
def create_hdf5_file_from_filesystem_path(output_filename : str,
|
def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
|
||||||
path_to_input_directory: str,
|
path_to_filenames_dict: dict = None,
|
||||||
select_dir_keywords = [],
|
select_dir_keywords : list = [],
|
||||||
select_file_keywords =[],
|
|
||||||
top_sub_dir_mask : bool = True,
|
|
||||||
root_metadata_dict : dict = {}):
|
root_metadata_dict : dict = {}):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -257,17 +255,17 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
----------
|
----------
|
||||||
output_filename : str
|
output_filename : str
|
||||||
Name of the output HDF5 file.
|
Name of the output HDF5 file.
|
||||||
path_to_input_directory: str
|
path_to_input_directory : str
|
||||||
Path to root directory, specified with forward slashes, e.g., path/to/root.
|
Path to root directory, specified with forward slashes, e.g., path/to/root.
|
||||||
|
|
||||||
|
path_to_filenames_dict : dict, optional
|
||||||
|
A pre-processed dictionary where keys are directory paths on the input directory's tree and values are lists of files.
|
||||||
|
If provided, 'input_file_system_path' is ignored.
|
||||||
|
|
||||||
select_dir_keywords : list
|
select_dir_keywords : list
|
||||||
List of string elements to consider or select only directory paths that contain
|
List of string elements to consider or select only directory paths that contain
|
||||||
a word in 'select_dir_keywords'. When empty, all directory paths are considered
|
a word in 'select_dir_keywords'. When empty, all directory paths are considered
|
||||||
to be included in the HDF5 file group hierarchy.
|
to be included in the HDF5 file group hierarchy.
|
||||||
select_file_keywords : list
|
|
||||||
List of string elements to consider or select only files that contain a word in
|
|
||||||
'select_file_keywords'. When empty, all files are considered to be stored in the HDF5 file.
|
|
||||||
top_sub_dir_mask : bool
|
|
||||||
Mask for top-level subdirectories.
|
|
||||||
root_metadata_dict : dict
|
root_metadata_dict : dict
|
||||||
Metadata to include at the root level of the HDF5 file.
|
Metadata to include at the root level of the HDF5 file.
|
||||||
|
|
||||||
@ -277,7 +275,9 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
Path to the created HDF5 file.
|
Path to the created HDF5 file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
allowed_file_extensions = filereader_registry.file_extensions
|
output_filename = path_to_input_directory + '.h5'
|
||||||
|
|
||||||
|
|
||||||
if '/' in path_to_input_directory:
|
if '/' in path_to_input_directory:
|
||||||
path_to_input_directory= path_to_input_directory.replace('/',os.sep)
|
path_to_input_directory= path_to_input_directory.replace('/',os.sep)
|
||||||
else:
|
else:
|
||||||
@ -287,12 +287,12 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
select_dir_keywords[i] = keyword.replace('/',os.sep)
|
select_dir_keywords[i] = keyword.replace('/',os.sep)
|
||||||
|
|
||||||
# Copy input_directory into the output_dir_path, and work with it from now on
|
# Copy input_directory into the output_dir_path, and work with it from now on
|
||||||
output_dir_path = os.path.splitext(output_filename)[0].replace('/',os.sep)
|
output_dir_path = os.path.splitext(output_filename)[0].replace('/',os.sep)
|
||||||
path_to_filenames_dict = utils.copy_directory_with_contraints(path_to_input_directory,
|
|
||||||
output_dir_path,
|
if not path_to_filenames_dict:
|
||||||
select_dir_keywords,
|
path_to_filenames_dict = utils.copy_directory_with_contraints(path_to_input_directory,
|
||||||
select_file_keywords,
|
output_dir_path,
|
||||||
allowed_file_extensions)
|
dry_run=True)
|
||||||
# Set input_directory as copied input directory
|
# Set input_directory as copied input directory
|
||||||
root_dir = output_dir_path
|
root_dir = output_dir_path
|
||||||
|
|
||||||
@ -307,10 +307,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
# Print and log the start message
|
# Print and log the start message
|
||||||
print(start_message)
|
print(start_message)
|
||||||
logging.info(start_message)
|
logging.info(start_message)
|
||||||
|
|
||||||
# Check if dirpath is valid. TODO: This is perhaps redundant by design of path_to_filenames_dict.
|
|
||||||
if not is_valid_directory_path(dirpath,select_dir_keywords):
|
|
||||||
continue
|
|
||||||
# Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
|
# Check if filtered_filenames_list is nonempty. TODO: This is perhaps redundant by design of path_to_filenames_dict.
|
||||||
if not filtered_filenames_list:
|
if not filtered_filenames_list:
|
||||||
continue
|
continue
|
||||||
@ -319,7 +316,10 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/')
|
group_name = group_name.replace(root_dir.replace(os.sep,'/') + '/', '/')
|
||||||
|
|
||||||
# Flatten group name to one level
|
# Flatten group name to one level
|
||||||
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
|
if select_dir_keywords:
|
||||||
|
offset = sum([len(i.split(os.sep)) if i in dirpath else 0 for i in select_dir_keywords])
|
||||||
|
else:
|
||||||
|
offset = 1
|
||||||
tmp_list = group_name.split('/')
|
tmp_list = group_name.split('/')
|
||||||
if len(tmp_list) > offset+1:
|
if len(tmp_list) > offset+1:
|
||||||
group_name = '/'.join([tmp_list[i] for i in range(offset+1)])
|
group_name = '/'.join([tmp_list[i] for i in range(offset+1)])
|
||||||
|
Reference in New Issue
Block a user