Synch with remote repo
@@ -1,403 +1,403 @@
import pandas as pd
import os
import sys
import shutil
import datetime
import logging
import numpy as np
import h5py
import re

def setup_logging(log_dir, log_filename):
    """Sets up logging to a specified directory and file.

    Parameters:
    log_dir (str): Directory to save the log file.
    log_filename (str): Name of the log file.
    """
    # Ensure the log directory exists
    os.makedirs(log_dir, exist_ok=True)

    # Create a logger instance
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Create a file handler
    log_path = os.path.join(log_dir, log_filename)
    file_handler = logging.FileHandler(log_path)

    # Create a formatter and set it for the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    # Add the handler to the logger
    logger.addHandler(file_handler)

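# Illustrative usage sketch (the directory and file names below are
# hypothetical, not taken from this module):
#
#   setup_logging('logs', 'data_pipeline.log')
#   logging.info('Started processing')  # appended to logs/data_pipeline.log
#
# Note: repeated calls attach additional handlers to the root logger, so each
# record is then written once per attached handler.
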
def is_callable_list(x: list):
    return all(callable(item) for item in x)


def is_str_list(x: list):
    return all(isinstance(item, str) for item in x)

def augment_with_filetype(df):
    df['filetype'] = [os.path.splitext(item)[1][1:] for item in df['filename']]
    return df


def augment_with_filenumber(df):
    df['filenumber'] = [item[0:item.find('_')] for item in df['filename']]
    return df

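# Illustrative usage sketch (the filenames below are hypothetical):
#
#   df = pd.DataFrame({'filename': ['0001_scan.h5', '0002_log.txt']})
#   df = augment_with_filetype(df)    # adds 'filetype' column: ['h5', 'txt']
#   df = augment_with_filenumber(df)  # adds 'filenumber' column: ['0001', '0002']
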
def group_by_df_column(df, column_name: str):
    """
    df (pandas.DataFrame): DataFrame to be grouped.
    column_name (str): column of df by which the grouping operation will take place.
    """

    if column_name not in df.columns:
        raise ValueError("column_name must be in the columns of df.")

    return df[column_name]

def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame):

    sample_name = []
    sample_quality = []
    for item in input_data['sample']:
        if item.find('(') != -1:
            sample_name.append(item[0:item.find('(')])
            sample_quality.append(item[item.find('(')+1:len(item)-1])
        else:
            if item == '':
                sample_name.append('Not yet annotated')
                sample_quality.append('unevaluated')
            else:
                sample_name.append(item)
                sample_quality.append('good data')
    input_data['sample'] = sample_name
    input_data['data_quality'] = sample_quality

    return input_data

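# Illustrative usage sketch (the sample labels below are hypothetical):
#
#   df = pd.DataFrame({'sample': ['quartz (contaminated)', '', 'calcite']})
#   df = split_sample_col_into_sample_and_data_quality_cols(df)
#   # sample:       ['quartz ', 'Not yet annotated', 'calcite']
#   # data_quality: ['contaminated', 'unevaluated', 'good data']
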
def make_file_copy(source_file_path, output_folder_name: str = 'tmp_files'):

    pathtail, filename = os.path.split(source_file_path)
    backup_filename = filename

    # Resolve the output folder relative to the current working directory
    ROOT_DIR = os.path.abspath(os.curdir)
    tmp_dirpath = os.path.join(ROOT_DIR, output_folder_name)
    if not os.path.exists(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    tmp_file_path = os.path.join(tmp_dirpath, backup_filename)
    shutil.copy(source_file_path, tmp_file_path)

    return tmp_file_path

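# Illustrative usage sketch (the source path below is hypothetical):
#
#   tmp_path = make_file_copy('data/measurements.h5')
#   # copies the file into ./tmp_files/ (created if missing) and returns
#   # the path of the copy, e.g. '<cwd>/tmp_files/measurements.h5'
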
def created_at(datetime_format='%Y-%m-%d %H:%M:%S'):
    now = datetime.datetime.now()
    # Make the timestamp time-zone aware, using the local system's time zone
    now_tz_aware = now.astimezone()
    tz = now_tz_aware.strftime('%z')
    # Format the timestamp; the UTC-offset suffix is currently disabled
    created_at = now_tz_aware.strftime(datetime_format) #+ '_UTC-OFST_' + tz
    return created_at

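# Illustrative usage sketch (returned values depend on the local clock and
# time zone; the dates shown are hypothetical):
#
#   created_at()          # e.g. '2024-05-17 10:32:05'
#   created_at('%Y_%m')   # e.g. '2024_05'
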
def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Handle datetime columns (convert to string in 'yyyy-mm-dd hh-mm-ss' format)
    datetime_cols = df.select_dtypes(include=['datetime']).columns
    for col in datetime_cols:
        # Convert datetime to string in the specified format, handling NaT
        df[col] = df[col].dt.strftime('%Y-%m-%d %H-%M-%S')

    # Handle object columns with mixed types
    otype_cols = df.select_dtypes(include='O')
    for col in otype_cols:
        col_data = df[col]

        # Check if all elements in the column are strings
        if col_data.apply(lambda x: isinstance(x, str)).all():
            df[col] = df[col].astype(str)
        else:
            # If the column contains mixed types, attempt to convert to numeric, coercing errors to NaN
            df[col] = pd.to_numeric(col_data, errors='coerce')

        # Handle NaN values differently based on dtype
        if pd.api.types.is_string_dtype(df[col]):
            # Replace NaN in string columns with empty strings
            df[col] = df[col].fillna('')
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Keep NaN in numeric columns, but cast integer columns to float
            # so that NaN can be represented
            if pd.api.types.is_integer_dtype(df[col]):
                df[col] = df[col].astype(float)
            else:
                df[col] = df[col].fillna(np.nan)  # No-op: NaN is preserved in float columns

    return df

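# Illustrative usage sketch (a hypothetical frame with mixed-type columns):
#
#   df = pd.DataFrame({'t': pd.to_datetime(['2024-05-17']),
#                      'mixed': ['3.5', 7, None]})
#   clean = sanitize_dataframe(df)
#   # 't' becomes the string '2024-05-17 00-00-00';
#   # 'mixed' is coerced to floats [3.5, 7.0, NaN]
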
def convert_dataframe_to_np_structured_array(df: pd.DataFrame):

    df = sanitize_dataframe(df)
    # Define the dtype for the structured array, ensuring compatibility with h5py
    dtype = []
    for col in df.columns:

        col_data = df[col]
        col_dtype = col_data.dtype

        try:
            if pd.api.types.is_string_dtype(col_dtype):
                # Convert string dtype to fixed-length strings
                max_len = int(col_data.str.len().max()) if not col_data.isnull().all() else 0
                dtype.append((col, f'S{max_len}'))
            elif pd.api.types.is_integer_dtype(col_dtype):
                dtype.append((col, 'i4'))  # Assuming 32-bit integer
            elif pd.api.types.is_float_dtype(col_dtype):
                dtype.append((col, 'f4'))  # Assuming 32-bit float
            else:
                # Handle unsupported data types
                print(f"Unsupported dtype found in column '{col}': {col_data.dtype}")
                raise ValueError(f"Unsupported data type: {col_data.dtype}")

        except Exception as e:
            # Log a more detailed error message
            print(f"Error processing column '{col}': {e}")
            raise

    # Convert the DataFrame to a structured array
    structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)

    return structured_array

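# Illustrative usage sketch (a hypothetical table):
#
#   df = pd.DataFrame({'name': ['a', 'bc'], 'value': [1.0, 2.5]})
#   arr = convert_dataframe_to_np_structured_array(df)
#   # arr.dtype is roughly [('name', 'S2'), ('value', '<f4')], ready to be
#   # written to an HDF5 dataset via h5py
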
def convert_string_to_bytes(input_list: list):
    """Convert a list of strings into a numpy array with utf8-type entries.

    Parameters
    ----------
    input_list (list) : list of string objects

    Returns
    -------
    input_array_bytes (ndarray): array of utf8-type entries.
    """
    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
    if input_list:
        max_length = max(len(item) for item in input_list)
        # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded
        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
        input_array_bytes = np.array(input_list_bytes, dtype=utf8_type(max_length))
    else:
        input_array_bytes = np.array([], dtype=utf8_type(0))

    return input_array_bytes

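# Illustrative usage sketch:
#
#   arr = convert_string_to_bytes(['alpha', 'beta'])
#   # fixed-length byte strings (here 5 bytes) tagged with h5py's UTF-8
#   # string dtype, suitable for an HDF5 dataset: [b'alpha', b'beta']
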
def convert_attrdict_to_np_structured_array(attr_value: dict):
    """
    Converts a dictionary of attributes into a numpy structured array for HDF5
    compound type compatibility.

    Each dictionary key is mapped to a field in the structured array, with the
    data type (S) determined by the longest string representation of the values.
    If the dictionary is empty, the function returns 'missing'.

    Parameters
    ----------
    attr_value : dict
        Dictionary containing the attributes to be converted. Example:
        attr_value = {
            'name': 'Temperature',
            'unit': 'Celsius',
            'value': 23.5,
            'timestamp': '2023-09-26 10:00'
        }

    Returns
    -------
    new_attr_value : ndarray or str
        Numpy structured array with UTF-8 encoded fields. Returns 'missing' if
        the input dictionary is empty.
    """
    dtype = []
    values_list = []
    # default=0 guards against an empty dictionary, which would otherwise make max() raise
    max_length = max((len(str(attr_value[key])) for key in attr_value.keys()), default=0)
    for key in attr_value.keys():
        if key != 'rename_as':
            dtype.append((key, f'S{max_length}'))
            values_list.append(attr_value[key])
    if values_list:
        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
    else:
        new_attr_value = 'missing'

    return new_attr_value

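# Illustrative usage sketch (reusing the docstring example):
#
#   attrs = {'name': 'Temperature', 'unit': 'Celsius', 'value': 23.5,
#            'timestamp': '2023-09-26 10:00'}
#   arr = convert_attrdict_to_np_structured_array(attrs)
#   # one-element structured array whose fields are fixed-length byte
#   # strings sized by the longest value ('S16' here)
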
def infer_units(column_name):
    # TODO: complete or remove

    match = re.search(r'\[.+\]', column_name)

    if match:
        return match
    else:
        match = re.search(r'\(.+\)', column_name)

    return match

def progressBar(count_value, total, suffix=''):
    bar_length = 100
    filled_up_length = int(round(bar_length * count_value / float(total)))
    percentage = round(100.0 * count_value / float(total), 1)
    bar = '=' * filled_up_length + '-' * (bar_length - filled_up_length)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percentage, '%', suffix))
    sys.stdout.flush()

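# Illustrative usage sketch ('items' and 'process' are hypothetical):
#
#   for i, item in enumerate(items):
#       process(item)
#       progressBar(i + 1, len(items), suffix='copying')
#   # the carriage return '\r' redraws the bar in place on each call
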
def copy_directory_with_contraints(input_dir_path, output_dir_path,
                                   select_dir_keywords=None,
                                   select_file_keywords=None,
                                   allowed_file_extensions=None,
                                   dry_run=False):
    """
    Copies files from input_dir_path to output_dir_path based on specified constraints.

    Parameters
    ----------
    input_dir_path (str): Path to the input directory.
    output_dir_path (str): Path to the output directory.
    select_dir_keywords (list): optional, List of keywords for selecting directories.
    select_file_keywords (list): optional, List of keywords for selecting files.
    allowed_file_extensions (list): optional, List of allowed file extensions.
    dry_run (bool): optional, if True, collect the directory-file mapping without copying anything.

    Returns
    -------
    path_to_files_dict (dict): dictionary mapping directory paths to lists of copied file names satisfying the constraints.
    """

    # Unconstrained default behavior: no filters; make sure variables are lists even when defined as None in the function signature
    select_dir_keywords = select_dir_keywords or []
    select_file_keywords = select_file_keywords or []
    allowed_file_extensions = allowed_file_extensions or []

    date = created_at('%Y_%m').replace(":", "-")
    log_dir = 'logs/'
    setup_logging(log_dir, f"copy_directory_with_contraints_{date}.log")

    # Define helper functions. They return True by default when the filtering lists are either None or []
    def has_allowed_extension(filename):
        return not allowed_file_extensions or os.path.splitext(filename)[1] in allowed_file_extensions

    def file_is_selected(filename):
        return not select_file_keywords or any(keyword in filename for keyword in select_file_keywords)

    # Collect paths of directories that are directly connected to the root dir and match select_dir_keywords
    paths = []
    if select_dir_keywords:
        for item in os.listdir(input_dir_path):
            if any(item in keyword for keyword in select_dir_keywords):
                paths.append(os.path.join(input_dir_path, item))
    else:
        paths.append(input_dir_path)

    path_to_files_dict = {}  # Dictionary to store directory-file pairs satisfying constraints

    for subpath in paths:

        for dirpath, _, filenames in os.walk(subpath, topdown=False):

            # Reduce filenames to those that are admissible
            admissible_filenames = [
                filename for filename in filenames
                if file_is_selected(filename) and has_allowed_extension(filename)
            ]

            if admissible_filenames:  # Only create the directory if there are files to copy

                relative_dirpath = os.path.relpath(dirpath, input_dir_path)
                target_dirpath = os.path.join(output_dir_path, relative_dirpath)
                path_to_files_dict[target_dirpath] = admissible_filenames

                if not dry_run:

                    # Perform the actual copying
                    os.makedirs(target_dirpath, exist_ok=True)

                    for filename in admissible_filenames:
                        src_file_path = os.path.join(dirpath, filename)
                        dest_file_path = os.path.join(target_dirpath, filename)
                        try:
                            shutil.copy2(src_file_path, dest_file_path)
                        except Exception as e:
                            logging.error("Failed to copy %s: %s", src_file_path, e)

    return path_to_files_dict

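# Illustrative usage sketch (all paths and keywords below are hypothetical):
#
#   mapping = copy_directory_with_contraints(
#       'raw_data', 'staging',
#       select_file_keywords=['2023'],
#       allowed_file_extensions=['.h5', '.txt'],
#       dry_run=True)
#   # inspect 'mapping' first, then rerun with dry_run=False to copy
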
def to_serializable_dtype(value):

    """Transform value's dtype into a YAML/JSON compatible dtype.

    Parameters
    ----------
    value : np.generic or np.ndarray
        NumPy scalar or array to convert.

    Returns
    -------
    str, float, list, or dict
        Python-native equivalent of the input value; NaN if no compatible
        type could be determined.
    """
    try:
        if isinstance(value, np.generic):
            if np.issubdtype(value.dtype, np.bytes_):
                value = value.decode('utf-8')
            elif np.issubdtype(value.dtype, np.str_):
                value = str(value)
            elif np.issubdtype(value.dtype, np.number):
                value = float(value)
            else:
                print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                value = np.nan
        elif isinstance(value, np.ndarray):
            # Handle structured array types (with fields)
            if value.dtype.names:
                value = {field: to_serializable_dtype(value[field]) for field in value.dtype.names}
            else:
                # Handle regular NumPy arrays under the assumption of a uniform dtype across array elements
                # TODO: evaluate a more general way to check for individual dtypes
                if isinstance(value[0], bytes):
                    # Decode bytes
                    value = [item.decode('utf-8') for item in value] if len(value) > 1 else value[0].decode('utf-8')
                elif isinstance(value[0], str):
                    # Already a string type
                    value = [str(item) for item in value] if len(value) > 1 else str(value[0])
                elif isinstance(value[0], int):
                    # Integer type
                    value = [int(item) for item in value] if len(value) > 1 else int(value[0])
                elif isinstance(value[0], float):
                    # Floating type
                    value = [float(item) for item in value] if len(value) > 1 else float(value[0])
                else:
                    print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                    print("Debug: value.dtype is", value.dtype)
                    value = np.nan

    except Exception as e:
        print(f'Error converting value: {e}. Value has been set to NaN.')
        value = np.nan

    return value

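# Illustrative usage sketch:
#
#   to_serializable_dtype(np.float32(3.5))         # -> 3.5 (Python float)
#   to_serializable_dtype(np.array([b'a', b'b']))  # -> ['a', 'b']
#   arr = np.array([(b'x', 1.0)], dtype=[('k', 'S1'), ('v', 'f8')])
#   to_serializable_dtype(arr)                     # -> {'k': 'x', 'v': 1.0}
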
def is_structured_array(attr_val):
    if isinstance(attr_val, np.ndarray):
        return attr_val.dtype.names is not None
    else:
        return False