def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose datetime and object columns are uniform.

    The result is meant to be safe to turn into a NumPy structured array:

    * Columns selected by ``include=['datetime']`` are rendered as
      ``'YYYY-MM-DD HH:MM:SS'`` text; missing timestamps (NaT) become ``''``.
    * Object columns whose non-missing values are all ``str`` stay text, with
      missing entries replaced by ``''``.
    * Every other object column (mixed content, or nothing but missing
      values) is converted with ``pd.to_numeric(errors='coerce')``, so
      unparseable entries become NaN.  An integer result is widened to float
      so that all such columns share one NaN-capable numeric type.

    Columns of any other dtype are left untouched.

    Args:
        df: Input frame.  It is not modified.

    Returns:
        A sanitized copy of ``df``.
    """
    # Work on a copy so callers do not find their frame rewritten.
    df = df.copy()

    # ``strftime`` yields NaN for NaT.  Blank those gaps right away: otherwise
    # the object-column pass below would see a str/float mix and coerce every
    # formatted timestamp in the column to NaN.
    for col in df.select_dtypes(include=['datetime']).columns:
        df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S').fillna('')

    for col in df.select_dtypes(include='O').columns:
        column = df[col]
        present = column.dropna()

        # A text column is one whose non-missing values are all strings.
        # A column with rows but no non-missing value has nothing to keep as
        # text, so it falls through to the numeric branch (all NaN).
        is_text = all(isinstance(value, str) for value in present) and (
            column.empty or not present.empty
        )

        if is_text:
            df[col] = column.fillna('')
            continue

        numeric = pd.to_numeric(column, errors='coerce')
        if pd.api.types.is_integer_dtype(numeric):
            # Integer dtypes cannot represent NaN; keep one float type.
            numeric = numeric.astype(float)
        df[col] = numeric

    return df