Synch with remote repo
@@ -1,403 +1,403 @@
import pandas as pd
import os
import sys
import shutil
import datetime
import logging
import numpy as np
import h5py
import re

def setup_logging(log_dir, log_filename):
    """Sets up logging to a specified directory and file.

    Parameters:
    log_dir (str): Directory to save the log file.
    log_filename (str): Name of the log file.
    """
    # Ensure the log directory exists
    os.makedirs(log_dir, exist_ok=True)

    # Create a logger instance
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Create a file handler
    log_path = os.path.join(log_dir, log_filename)
    file_handler = logging.FileHandler(log_path)

    # Create a formatter and set it for the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    # Add the handler to the logger
    logger.addHandler(file_handler)

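# Illustrative usage sketch (the directory and file names below are
# hypothetical, not taken from this module):
#
#   setup_logging('logs', 'data_pipeline.log')
#   logging.info('Started processing')  # appended to logs/data_pipeline.log
#
# Note: repeated calls attach additional handlers to the root logger, so each
# record is then written once per attached handler.
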
def is_callable_list(x: list):
    return all(callable(item) for item in x)


def is_str_list(x: list):
    return all(isinstance(item, str) for item in x)

def augment_with_filetype(df):
    df['filetype'] = [os.path.splitext(item)[1][1:] for item in df['filename']]
    return df


def augment_with_filenumber(df):
    df['filenumber'] = [item[0:item.find('_')] for item in df['filename']]
    return df

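# Illustrative usage sketch (the filenames below are hypothetical):
#
#   df = pd.DataFrame({'filename': ['0001_scan.h5', '0002_log.txt']})
#   df = augment_with_filetype(df)    # adds 'filetype' column: ['h5', 'txt']
#   df = augment_with_filenumber(df)  # adds 'filenumber' column: ['0001', '0002']
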
def group_by_df_column(df, column_name: str):
    """
    df (pandas.DataFrame): DataFrame to be grouped.
    column_name (str): column of df by which the grouping operation will take place.
    """

    if column_name not in df.columns:
        raise ValueError("column_name must be in the columns of df.")

    return df[column_name]

def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame):

    sample_name = []
    sample_quality = []
    for item in input_data['sample']:
        if item.find('(') != -1:
            sample_name.append(item[0:item.find('(')])
            sample_quality.append(item[item.find('(')+1:len(item)-1])
        else:
            if item == '':
                sample_name.append('Not yet annotated')
                sample_quality.append('unevaluated')
            else:
                sample_name.append(item)
                sample_quality.append('good data')
    input_data['sample'] = sample_name
    input_data['data_quality'] = sample_quality

    return input_data

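# Illustrative usage sketch (the sample labels below are hypothetical):
#
#   df = pd.DataFrame({'sample': ['quartz (contaminated)', '', 'calcite']})
#   df = split_sample_col_into_sample_and_data_quality_cols(df)
#   # sample:       ['quartz ', 'Not yet annotated', 'calcite']
#   # data_quality: ['contaminated', 'unevaluated', 'good data']
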
def make_file_copy(source_file_path, output_folder_name: str = 'tmp_files'):

    pathtail, filename = os.path.split(source_file_path)
    backup_filename = filename

    # Resolve the output folder relative to the current working directory
    ROOT_DIR = os.path.abspath(os.curdir)
    tmp_dirpath = os.path.join(ROOT_DIR, output_folder_name)
    if not os.path.exists(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    tmp_file_path = os.path.join(tmp_dirpath, backup_filename)
    shutil.copy(source_file_path, tmp_file_path)

    return tmp_file_path

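# Illustrative usage sketch (the source path below is hypothetical):
#
#   tmp_path = make_file_copy('data/measurements.h5')
#   # copies the file into ./tmp_files/ (created if missing) and returns
#   # the path of the copy, e.g. '<cwd>/tmp_files/measurements.h5'
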
def created_at(datetime_format='%Y-%m-%d %H:%M:%S'):
    now = datetime.datetime.now()
    # Make the timestamp time-zone aware, using the local system's time zone
    now_tz_aware = now.astimezone()
    tz = now_tz_aware.strftime('%z')
    # Format the timestamp; the UTC-offset suffix is currently disabled
    created_at = now_tz_aware.strftime(datetime_format) #+ '_UTC-OFST_' + tz
    return created_at

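# Illustrative usage sketch (returned values depend on the local clock and
# time zone; the dates shown are hypothetical):
#
#   created_at()          # e.g. '2024-05-17 10:32:05'
#   created_at('%Y_%m')   # e.g. '2024_05'
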
def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Handle datetime columns (convert to string in 'yyyy-mm-dd hh-mm-ss' format)
    datetime_cols = df.select_dtypes(include=['datetime']).columns
    for col in datetime_cols:
        # Convert datetime to string in the specified format, handling NaT
        df[col] = df[col].dt.strftime('%Y-%m-%d %H-%M-%S')

    # Handle object columns with mixed types
    otype_cols = df.select_dtypes(include='O')
    for col in otype_cols:
        col_data = df[col]

        # Check if all elements in the column are strings
        if col_data.apply(lambda x: isinstance(x, str)).all():
            df[col] = df[col].astype(str)
        else:
            # If the column contains mixed types, attempt to convert to numeric, coercing errors to NaN
            df[col] = pd.to_numeric(col_data, errors='coerce')

        # Handle NaN values differently based on dtype
        if pd.api.types.is_string_dtype(df[col]):
            # Replace NaN in string columns with empty strings
            df[col] = df[col].fillna('')
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Keep NaN in numeric columns, but cast integer columns to float
            # so that NaN can be represented
            if pd.api.types.is_integer_dtype(df[col]):
                df[col] = df[col].astype(float)
            else:
                df[col] = df[col].fillna(np.nan)  # No-op: NaN is preserved in float columns

    return df

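# Illustrative usage sketch (a hypothetical frame with mixed-type columns):
#
#   df = pd.DataFrame({'t': pd.to_datetime(['2024-05-17']),
#                      'mixed': ['3.5', 7, None]})
#   clean = sanitize_dataframe(df)
#   # 't' becomes the string '2024-05-17 00-00-00';
#   # 'mixed' is coerced to floats [3.5, 7.0, NaN]
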
def convert_dataframe_to_np_structured_array(df: pd.DataFrame):

    df = sanitize_dataframe(df)
    # Define the dtype for the structured array, ensuring compatibility with h5py
    dtype = []
    for col in df.columns:

        col_data = df[col]
        col_dtype = col_data.dtype

        try:
            if pd.api.types.is_string_dtype(col_dtype):
                # Convert string dtype to fixed-length strings
                max_len = int(col_data.str.len().max()) if not col_data.isnull().all() else 0
                dtype.append((col, f'S{max_len}'))
            elif pd.api.types.is_integer_dtype(col_dtype):
                dtype.append((col, 'i4'))  # Assuming 32-bit integer
            elif pd.api.types.is_float_dtype(col_dtype):
                dtype.append((col, 'f4'))  # Assuming 32-bit float
            else:
                # Handle unsupported data types
                print(f"Unsupported dtype found in column '{col}': {col_data.dtype}")
                raise ValueError(f"Unsupported data type: {col_data.dtype}")

        except Exception as e:
            # Log a more detailed error message
            print(f"Error processing column '{col}': {e}")
            raise

    # Convert the DataFrame to a structured array
    structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)

    return structured_array

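# Illustrative usage sketch (a hypothetical table):
#
#   df = pd.DataFrame({'name': ['a', 'bc'], 'value': [1.0, 2.5]})
#   arr = convert_dataframe_to_np_structured_array(df)
#   # arr.dtype is roughly [('name', 'S2'), ('value', '<f4')], ready to be
#   # written to an HDF5 dataset via h5py
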
def convert_string_to_bytes(input_list: list):
    """Convert a list of strings into a numpy array with utf8-type entries.

    Parameters
    ----------
    input_list (list) : list of string objects

    Returns
    -------
    input_array_bytes (ndarray): array of utf8-type entries.
    """
    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
    if input_list:
        max_length = max(len(item) for item in input_list)
        # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded
        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
        input_array_bytes = np.array(input_list_bytes, dtype=utf8_type(max_length))
    else:
        input_array_bytes = np.array([], dtype=utf8_type(0))

    return input_array_bytes

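# Illustrative usage sketch:
#
#   arr = convert_string_to_bytes(['alpha', 'beta'])
#   # fixed-length byte strings (here 5 bytes) tagged with h5py's UTF-8
#   # string dtype, suitable for an HDF5 dataset: [b'alpha', b'beta']
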
def convert_attrdict_to_np_structured_array(attr_value: dict):
    """
    Converts a dictionary of attributes into a numpy structured array for HDF5
    compound type compatibility.

    Each dictionary key is mapped to a field in the structured array, with the
    data type (S) determined by the longest string representation of the values.
    If the dictionary is empty, the function returns 'missing'.

    Parameters
    ----------
    attr_value : dict
        Dictionary containing the attributes to be converted. Example:
        attr_value = {
            'name': 'Temperature',
            'unit': 'Celsius',
            'value': 23.5,
            'timestamp': '2023-09-26 10:00'
        }

    Returns
    -------
    new_attr_value : ndarray or str
        Numpy structured array with UTF-8 encoded fields. Returns 'missing' if
        the input dictionary is empty.
    """
    dtype = []
    values_list = []
    # default=0 guards against an empty dictionary, which would otherwise make max() raise
    max_length = max((len(str(attr_value[key])) for key in attr_value.keys()), default=0)
    for key in attr_value.keys():
        if key != 'rename_as':
            dtype.append((key, f'S{max_length}'))
            values_list.append(attr_value[key])
    if values_list:
        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
    else:
        new_attr_value = 'missing'

    return new_attr_value

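# Illustrative usage sketch (reusing the docstring example):
#
#   attrs = {'name': 'Temperature', 'unit': 'Celsius', 'value': 23.5,
#            'timestamp': '2023-09-26 10:00'}
#   arr = convert_attrdict_to_np_structured_array(attrs)
#   # one-element structured array whose fields are fixed-length byte
#   # strings sized by the longest value ('S16' here)
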
def infer_units(column_name):
    # TODO: complete or remove

    match = re.search(r'\[.+\]', column_name)

    if match:
        return match
    else:
        match = re.search(r'\(.+\)', column_name)

    return match

def progressBar(count_value, total, suffix=''):
    bar_length = 100
    filled_up_length = int(round(bar_length * count_value / float(total)))
    percentage = round(100.0 * count_value / float(total), 1)
    bar = '=' * filled_up_length + '-' * (bar_length - filled_up_length)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percentage, '%', suffix))
    sys.stdout.flush()

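# Illustrative usage sketch ('items' and 'process' are hypothetical):
#
#   for i, item in enumerate(items):
#       process(item)
#       progressBar(i + 1, len(items), suffix='copying')
#   # the carriage return '\r' redraws the bar in place on each call
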
def copy_directory_with_contraints(input_dir_path, output_dir_path,
                                   select_dir_keywords=None,
                                   select_file_keywords=None,
                                   allowed_file_extensions=None,
                                   dry_run=False):
    """
    Copies files from input_dir_path to output_dir_path based on specified constraints.

    Parameters
    ----------
    input_dir_path (str): Path to the input directory.
    output_dir_path (str): Path to the output directory.
    select_dir_keywords (list): optional, List of keywords for selecting directories.
    select_file_keywords (list): optional, List of keywords for selecting files.
    allowed_file_extensions (list): optional, List of allowed file extensions.
    dry_run (bool): optional, if True, collect the directory-file mapping without copying anything.

    Returns
    -------
    path_to_files_dict (dict): dictionary mapping directory paths to lists of copied file names satisfying the constraints.
    """

    # Unconstrained default behavior: no filters; make sure variables are lists even when defined as None in the function signature
    select_dir_keywords = select_dir_keywords or []
    select_file_keywords = select_file_keywords or []
    allowed_file_extensions = allowed_file_extensions or []

    date = created_at('%Y_%m').replace(":", "-")
    log_dir = 'logs/'
    setup_logging(log_dir, f"copy_directory_with_contraints_{date}.log")

    # Define helper functions. They return True by default when the filtering lists are either None or []
    def has_allowed_extension(filename):
        return not allowed_file_extensions or os.path.splitext(filename)[1] in allowed_file_extensions

    def file_is_selected(filename):
        return not select_file_keywords or any(keyword in filename for keyword in select_file_keywords)

    # Collect paths of directories that are directly connected to the root dir and match select_dir_keywords
    paths = []
    if select_dir_keywords:
        for item in os.listdir(input_dir_path):
            if any(item in keyword for keyword in select_dir_keywords):
                paths.append(os.path.join(input_dir_path, item))
    else:
        paths.append(input_dir_path)

    path_to_files_dict = {}  # Dictionary to store directory-file pairs satisfying constraints

    for subpath in paths:

        for dirpath, _, filenames in os.walk(subpath, topdown=False):

            # Reduce filenames to those that are admissible
            admissible_filenames = [
                filename for filename in filenames
                if file_is_selected(filename) and has_allowed_extension(filename)
            ]

            if admissible_filenames:  # Only create the directory if there are files to copy

                relative_dirpath = os.path.relpath(dirpath, input_dir_path)
                target_dirpath = os.path.join(output_dir_path, relative_dirpath)
                path_to_files_dict[target_dirpath] = admissible_filenames

                if not dry_run:

                    # Perform the actual copying
                    os.makedirs(target_dirpath, exist_ok=True)

                    for filename in admissible_filenames:
                        src_file_path = os.path.join(dirpath, filename)
                        dest_file_path = os.path.join(target_dirpath, filename)
                        try:
                            shutil.copy2(src_file_path, dest_file_path)
                        except Exception as e:
                            logging.error("Failed to copy %s: %s", src_file_path, e)

    return path_to_files_dict

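# Illustrative usage sketch (all paths and keywords below are hypothetical):
#
#   mapping = copy_directory_with_contraints(
#       'raw_data', 'staging',
#       select_file_keywords=['2023'],
#       allowed_file_extensions=['.h5', '.txt'],
#       dry_run=True)
#   # inspect 'mapping' first, then rerun with dry_run=False to copy
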
def to_serializable_dtype(value):

    """Transform value's dtype into a YAML/JSON compatible dtype.

    Parameters
    ----------
    value : np.generic or np.ndarray
        NumPy scalar or array to convert.

    Returns
    -------
    str, float, list, or dict
        Python-native equivalent of the input value; NaN if no compatible
        type could be determined.
    """
    try:
        if isinstance(value, np.generic):
            if np.issubdtype(value.dtype, np.bytes_):
                value = value.decode('utf-8')
            elif np.issubdtype(value.dtype, np.str_):
                value = str(value)
            elif np.issubdtype(value.dtype, np.number):
                value = float(value)
            else:
                print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                value = np.nan
        elif isinstance(value, np.ndarray):
            # Handle structured array types (with fields)
            if value.dtype.names:
                value = {field: to_serializable_dtype(value[field]) for field in value.dtype.names}
            else:
                # Handle regular NumPy arrays under the assumption of a uniform dtype across array elements
                # TODO: evaluate a more general way to check for individual dtypes
                if isinstance(value[0], bytes):
                    # Decode bytes
                    value = [item.decode('utf-8') for item in value] if len(value) > 1 else value[0].decode('utf-8')
                elif isinstance(value[0], str):
                    # Already a string type
                    value = [str(item) for item in value] if len(value) > 1 else str(value[0])
                elif isinstance(value[0], int):
                    # Integer type
                    value = [int(item) for item in value] if len(value) > 1 else int(value[0])
                elif isinstance(value[0], float):
                    # Floating type
                    value = [float(item) for item in value] if len(value) > 1 else float(value[0])
                else:
                    print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                    print("Debug: value.dtype is", value.dtype)
                    value = np.nan

    except Exception as e:
        print(f'Error converting value: {e}. Value has been set to NaN.')
        value = np.nan

    return value

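# Illustrative usage sketch:
#
#   to_serializable_dtype(np.float32(3.5))         # -> 3.5 (Python float)
#   to_serializable_dtype(np.array([b'a', b'b']))  # -> ['a', 'b']
#   arr = np.array([(b'x', 1.0)], dtype=[('k', 'S1'), ('v', 'f8')])
#   to_serializable_dtype(arr)                     # -> {'k': 'x', 'v': 1.0}
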
def is_structured_array(attr_val):
    if isinstance(attr_val, np.ndarray):
        return attr_val.dtype.names is not None
    else:
        return False