Made a few optimizations to the code and documentation. Expressions relying on list comprehensions were simplified to generator expressions, e.g., any([keyword in filename for keyword in select_file_keywords]) was simplified to any(keyword in filename for keyword in select_file_keywords).

This commit is contained in:
2024-05-24 09:06:07 +02:00
parent d574ac382d
commit 1537633b1a

View File

@ -225,14 +225,6 @@ def annotate_root_dir(filename,annotation_dict: dict):
file.attrs.create('metadata_'+key, annotation_dict[key])
import shutil
#def create_hdf5_file_from_filesystem_path(config_param : dict ,
# input_file_system_path : str,
# select_dir_keywords = [],
# select_file_keywords =[],
# top_sub_dir_mask : bool = True):
def is_valid_directory_path(dirpath,select_dir_keywords):
activated_keywords = []
@ -256,9 +248,12 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
"""
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) of a given filesystem path.
When file and directory keywords are non-empty, the keywords enable filtering of directories and files that do not contain the specified keywords.
When file and directory keywords are non-empty, the keywords enable filtering of directory paths and file paths that do not contain the specified keywords.
In the .h5 file, only files that are admissible file formats will be stored in the form of datasets and attributes.
The data integration capabilities are limited by our file reader, which can only access data from a list of admissible file formats.
These however can be extended.
Directories are groups in the resulting hdf5 file.
Files are formatted as a composite object consisting of a group, file, and attributes.
Parameters:
@ -317,26 +312,28 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
# (directory_path, suitable files) relationships in a dictionary.
file_paths_dict = {}
check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
#check_file_ext = lambda filename: any([ext in filename for ext in admissible_file_ext_list])
check_file_ext = lambda filename: os.path.splitext(filename)[1] in admissible_file_ext_list
for dirpath, _, filenames in os.walk(item,topdown=False):
file_paths_dict[dirpath] = []
# Check files that have an admissible extension and store them in admissible_filenames list
# Keep files that have an admissible extension and store them in admissible_filenames list
admissible_filenames = []
for fn in filenames:
if check_file_ext(fn):
admissible_filenames.append(fn)
if select_file_keywords: # when select_file_keywords = [], all files are considered
for filename in admissible_filenames:
# Do not consider files with types for which there is still no file_reader. TODO: extend file_reader library.
#if not any([ext in filename for ext in admissible_file_ext_list]):
# continue
#for filename in admissible_filenames:
for i in range(len(admissible_filenames) - 1, -1, -1):
filename = admissible_filenames[i]
# Add files with name, that contains any of the file_keywords
if any([keyword in filename for keyword in select_file_keywords]):
file_paths_dict[dirpath].append(filename)
# Remove files whose filenames do not satisfy the file keyword constraints.
if not any(keyword in filename for keyword in select_file_keywords):
admissible_filenames.pop(i)
file_paths_dict[dirpath] = admissible_filenames
else:
file_paths_dict[dirpath] = admissible_filenames
@ -344,21 +341,6 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
dirpath, dirnames, filenames_list = node
#if node_number == 0:
# offset = dirpath.count(os.sep)
# Filter out files with filenames not containing a keyword specified in the parameter 'select_file_keywords'.
# When select_file_keywords is an empty, i.e., [], do not apply any filter on the filenames.
#filtered_filename_list = []
#if select_file_keywords:
# for filename in filenames_list:
# if any([keyword in filename for keyword in select_file_keywords]):
# filtered_filename_list.append(filename)
#else:
# filtered_filename_list = filenames_list.copy()
filtered_filename_list = file_paths_dict.get(dirpath,filenames_list.copy())
@ -389,9 +371,6 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
print(group_name,' was already created.')
# TODO: for each "admissible" file in filenames, create an associated dataset in the corresponding group (subdirectory)
for filenumber, filename in enumerate(filtered_filename_list):
# Get file extension (or file type)