Implemented a sanitize_dataframe function to handle object ('O') dtype columns, which may contain a mix of numbers and strings yet be detected as string types. It is now applied before converting a dataframe into a structured numpy array.

This commit is contained in:
2024-11-23 16:28:49 +01:00
parent 8ab2cb3bdb
commit fd92bce802

View File

@ -109,22 +109,67 @@ def created_at(datetime_format = '%Y-%m-%d %H:%M:%S'):
created_at = now_tz_aware.strftime(datetime_format) #+ '_UTC-OFST_' + tz
return created_at
def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a sanitized copy of *df* with datetime and object columns normalized.

    Datetime columns are rendered as 'yyyy-mm-dd hh:mm:ss' strings. Object
    ('O') columns are resolved to a single kind of value:

    * If every non-missing value is a string, the column is kept as text and
      missing values are replaced by the empty string.
    * Otherwise the column is converted with ``pd.to_numeric``; values that
      cannot be parsed become NaN, and an integer result is cast to float.

    Parameters:
        df: Frame to sanitize. It is not modified.

    Returns:
        A new DataFrame with the conversions above applied.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    # Handle datetime columns (convert to string in 'yyyy-mm-dd hh:mm:ss' format)
    for col in df.select_dtypes(include=['datetime']).columns:
        # strftime yields a missing value for NaT; store '' instead so the
        # column ends up holding strings only.
        df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S').fillna('')

    # Handle object columns with mixed types
    for col in df.select_dtypes(include='O').columns:
        col_data = df[col]
        present = col_data[col_data.notna()]

        # Missing entries must not disqualify a text column: otherwise the
        # numeric coercion below would turn every string into NaN.
        only_strings = all(isinstance(value, str) for value in present)

        # A non-empty column made up solely of missing values has no text to
        # preserve, so it takes the numeric path and becomes all-NaN floats.
        if only_strings and (len(present) > 0 or col_data.empty):
            df[col] = col_data.fillna('').astype(str)
            continue

        # Mixed content: convert to numeric, coercing unparseable values to NaN.
        numeric = pd.to_numeric(col_data, errors='coerce')
        if pd.api.types.is_integer_dtype(numeric):
            # Object columns that resolve to integers are stored as floats.
            numeric = numeric.astype(float)
        df[col] = numeric

    return df
def convert_dataframe_to_np_structured_array(df: pd.DataFrame):
df = sanitize_dataframe(df)
# Define the dtype for the structured array, ensuring compatibility with h5py
dtype = []
for col in df.columns:
col_dtype = df[col].dtype
if pd.api.types.is_string_dtype(col_dtype):
# Convert string dtype to fixed-length strings
max_len = df[col].str.len().max()
dtype.append((col, f'S{max_len}'))
elif pd.api.types.is_integer_dtype(col_dtype):
dtype.append((col, 'i4')) # Assuming 32-bit integer
elif pd.api.types.is_float_dtype(col_dtype):
dtype.append((col, 'f4')) # Assuming 32-bit float
else:
raise ValueError(f"Unsupported dtype: {col_dtype}")
col_data = df[col]
col_dtype = col_data.dtype
try:
if pd.api.types.is_string_dtype(col_dtype):
# Convert string dtype to fixed-length strings
max_len = col_data.str.len().max() if not col_data.isnull().all() else 0
dtype.append((col, f'S{max_len}'))
elif pd.api.types.is_integer_dtype(col_dtype):
dtype.append((col, 'i4')) # Assuming 32-bit integer
elif pd.api.types.is_float_dtype(col_dtype):
dtype.append((col, 'f4')) # Assuming 32-bit float
else:
# Handle unsupported data types
print(f"Unsupported dtype found in column '{col}': {col_data.dtype}")
raise ValueError(f"Unsupported data type: {col_data.dtype}")
except Exception as e:
# Log more detailed error message
print(f"Error processing column '{col}': {e}")
raise
# Convert the DataFrame to a structured array
structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)