Implemented two important changes. 1) The output filename is no longer passed as an input; it is automatically computed from an input config_param dict. 2) Input filenames in the file system path are now filtered during an initial walk through the directory tree. The stored path filenames can later be used for pruning the directory tree.

This commit is contained in:
2024-04-02 17:33:58 +02:00
parent 9c70fd643f
commit 39cae66936

View File

@@ -17,6 +17,14 @@ import h5py
import yaml
def progressBar(count_value, total, suffix='', bar_length=100):
    """Render a single-line textual progress bar on stdout.

    Writes a carriage-return-terminated line of the form
    ``[=====-----] 50.0% ...suffix`` so that repeated calls overwrite the
    same terminal line instead of scrolling.

    Parameters
    ----------
    count_value : int
        Number of items completed so far.
    total : int
        Total number of items. Must be non-zero (a zero total would
        raise ZeroDivisionError, as in the original implementation).
    suffix : str, optional
        Trailing text appended after the percentage (e.g. a status message).
    bar_length : int, optional
        Total width of the bar in characters. Defaults to 100, matching
        the previously hard-coded width, so existing callers are unaffected.
    """
    # round() before int() so e.g. 49.6% fills 50 characters, not 49
    filled_length = int(round(bar_length * count_value / float(total)))
    percentage = round(100.0 * count_value / float(total), 1)
    bar = '=' * filled_length + '-' * (bar_length - filled_length)
    # '\r' (no newline) returns the cursor to column 0 for the next update
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percentage, '%', suffix))
    # flush immediately: progress output must not sit in the stdio buffer
    sys.stdout.flush()
def read_mtable_as_dataframe(filename):
""" Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input .h5 file
@@ -204,15 +212,20 @@ def annotate_root_dir(filename,annotation_dict: dict):
import shutil
def create_hdf5_file_from_filesystem_path(ofilename : str,
def create_hdf5_file_from_filesystem_path(config_param : dict ,
input_file_system_path : str,
select_dir_keywords = [],
select_file_keywords =[],
top_sub_dir_mask : bool = True):
#def create_hdf5_file_from_filesystem_path(output_filename : str,
# input_file_system_path : str,
# select_dir_keywords = [],
# select_file_keywords =[],
# top_sub_dir_mask : bool = True):
"""
Creates an .h5 file with name ofilename that preserves the directory tree (or folder structure) of given a filesystem path and
a few file and directory keywords. The keywords enable filtering of directories and files that do not contain the specified keywords.
Creates an .h5 file named "output_filename" that preserves the directory tree (or folder structure) of a given filesystem path.
When the file and directory keyword lists are non-empty, they are used to filter out directories and files that do not contain the specified keywords.
In the .h5 file, only files that are admissible file formats will be stored in the form of datasets and attributes.
@@ -237,6 +250,15 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
# Ensure OS compliant paths and keywords
# TODO: validate config_param dict, make sure output_filename is a valid file_path
group_id = config_param['group_id']
user_initials = config_param['user_initials']
created_at = config_file.created_at()
output_dir = config_param['output_dir']
output_filename = output_dir + config_file.output_filename_tempate(group_id,created_at,user_initials)
admissible_file_ext_list = list(config_file.select_file_readers(group_id).keys())
if '/' in input_file_system_path:
input_file_system_path = input_file_system_path.replace('/',os.sep)
else:
@@ -246,7 +268,7 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
select_dir_keywords[i] = keyword.replace('/',os.sep)
with h5py.File(ofilename, 'w') as h5file:
with h5py.File(output_filename, 'w') as h5file:
# Visit each subdirectory from top to bottom, from the root directory defined by input_file_system_path down to the lower
# level directories.
@@ -266,6 +288,26 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
root_dir = input_file_system_path
# Create a dictionary with directory-files pairs where files satisfy keyword and admissible type constraints.
# It requires an extra pass over the directory tree and additional memory for the dictionary, but it may be useful
# to speed up the subsequent step and prune the resulting directory tree.
file_paths_dict = {}
if select_file_keywords:
for dirpath, _, filenames_list in os.walk(item,topdown=False):
file_paths_dict[dirpath] = []
for filename in filenames_list:
if not any([ext in filename for ext in admissible_file_ext_list]):
continue
if any([keyword in filename for keyword in select_file_keywords]):
file_paths_dict[dirpath].append(filename)
#admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys())
#for filename in filtered_filename_list.copy():
# if not any([ext in filename for ext in admissible_file_ext_list]):
# filtered_filename_list.remove(filename)
for node_number, node in enumerate(os.walk(item, topdown=True)):
dirpath, dirnames, filenames_list = node
@@ -277,24 +319,22 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
# When select_file_keywords is empty, i.e., [], do not apply any filter to the filenames.
filtered_filename_list = []
if select_file_keywords:
for filename in filenames_list:
if any([keyword in filename for keyword in select_file_keywords]):
filtered_filename_list.append(filename)
else:
filtered_filename_list = filenames_list.copy()
#filtered_filename_list = []
#if select_file_keywords:
# for filename in filenames_list:
# if any([keyword in filename for keyword in select_file_keywords]):
# filtered_filename_list.append(filename)
#else:
# filtered_filename_list = filenames_list.copy()
admissible_file_ext_list = list(config_file.ext_to_reader_dict.keys())
for filename in filtered_filename_list.copy():
if not any([ext in filename for ext in admissible_file_ext_list]):
filtered_filename_list.remove(filename)
filtered_filename_list = file_paths_dict.get(dirpath,filenames_list.copy())
# Skip subdirectories that do not contain a keyword in the parameter 'select_dir_keywords' when it is nonempty
if select_dir_keywords:
#if (dirpath.count(os.sep) > offset) and not any([item in dirpath for item in select_dir_keywords]):
#tail, dirname = os.path.split(dirpath)
#if not any([item in dirname for item in select_dir_keywords]):
if not any([item in dirpath for item in select_dir_keywords]):
continue
@@ -308,18 +348,21 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
# TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)
for filename in filtered_filename_list:
for filenumber, filename in enumerate(filtered_filename_list):
# Get file extension (or file type)
file_name, file_ext = os.path.splitext(filename)
#print(filename)
#try:
if not 'h5' in filename:
file_dict = config_file.ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
file_dict = config_file.select_file_readers(group_id)[file_ext](os.path.join(dirpath,filename))
if not file_dict:
continue
try:
# Create group and add their attributes
h5file[group_name].create_group(name=file_dict['name'])
for key in file_dict['attributes_dict'].keys():
@@ -343,11 +386,24 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
#dtype = file_dict['dtype'],
shape = dataset['shape'])
except Exception as inst:
# TODO: log when a file could not be stored as a dataset
print(inst)
else:
config_file.ext_to_reader_dict[file_ext](source_file_path = os.path.join(dirpath,filename),
config_file.select_file_readers(group_id)[file_ext](source_file_path = os.path.join(dirpath,filename),
dest_file_obj = h5file,
dest_group_name = group_name +'/'+filename)
print(file_ext, ':)')
#print(filename,file_ext, ':)')
progressBar(filenumber,len(filtered_filename_list), 'Uploading files in ' + dirpath)
output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename)
return output_filename, output_yml_filename_path