Implemented a sanitize_dataframe function to handle object ('O') dtype columns, which may mix numbers and strings yet be detected as a string type. It is now applied before converting a dataframe into a structured NumPy array.

2024-11-23 16:28:49 +01:00
parent 8ab2cb3bdb
commit fd92bce802
1 changed files with 56 additions and 11 deletions
--- a/utils/g5505_utils.py
+++ b/utils/g5505_utils.py
@ -109,22 +109,67 @@ def created_at(datetime_format = '%Y-%m-%d %H:%M:%S'):
    created_at = now_tz_aware.strftime(datetime_format) #+ '_UTC-OFST_' + tz
    return created_at

+def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of ``df`` whose datetime and object columns are normalized.
+
+    Datetime columns (naive or timezone-aware) are rendered as text in
+    'yyyy-mm-dd hh:mm:ss' format, with missing timestamps (NaT) written as ''.
+
+    Object ('O') columns are resolved as follows:
+      * if every non-missing value is a ``str``, the column is kept as text
+        and missing values are replaced by '';
+      * otherwise (mixed types, or no values at all) the column is converted
+        with ``pd.to_numeric(errors='coerce')``, so entries that cannot be
+        parsed become NaN; integer results are cast to float.
+
+    Columns of any other dtype are returned unchanged.
+
+    Parameters:
+        df: input DataFrame. It is not modified.
+
+    Returns:
+        A sanitized copy of ``df``.
+    """
+    # Work on a copy so the caller's DataFrame is left untouched.
+    df = df.copy()
+
+    # Handle datetime columns (convert to string in 'yyyy-mm-dd hh:mm:ss' format)
+    datetime_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns
+    for col in datetime_cols:
+        formatted = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
+        # strftime yields NaN for NaT; store '' so the column stays pure text
+        # instead of being mistaken for a mixed-type column below.
+        df[col] = formatted.fillna('')
+
+    # Handle object columns, which may hold text, numbers, or a mix of both
+    for col in df.select_dtypes(include='O').columns:
+        col_data = df[col]
+        # Judge the column by its non-missing values only: a text column with
+        # a few None/NaN entries must not be coerced to numeric (that would
+        # turn every string into NaN).
+        present = col_data.dropna()
+
+        if not present.empty and present.map(lambda x: isinstance(x, str)).all():
+            # Fill before casting so missing values become '' rather than 'nan'.
+            df[col] = col_data.fillna('').astype(str)
+            continue
+
+        # Mixed types (or nothing but missing values): coerce to numeric,
+        # turning unparseable entries into NaN.
+        numeric = pd.to_numeric(col_data, errors='coerce')
+        if pd.api.types.is_integer_dtype(numeric):
+            # Coerced columns are stored as float, the dtype able to hold NaN.
+            numeric = numeric.astype(float)
+        df[col] = numeric
+
+    return df
+
+
 def convert_dataframe_to_np_structured_array(df: pd.DataFrame):

+    df = sanitize_dataframe(df)
     # Define the dtype for the structured array, ensuring compatibility with h5py
    dtype = []
    for col in df.columns:
-        col_dtype = df[col].dtype
-        if pd.api.types.is_string_dtype(col_dtype):
-            # Convert string dtype to fixed-length strings
-            max_len = df[col].str.len().max()
-            dtype.append((col, f'S{max_len}'))
-        elif pd.api.types.is_integer_dtype(col_dtype):
-            dtype.append((col, 'i4'))  # Assuming 32-bit integer
-        elif pd.api.types.is_float_dtype(col_dtype):
-            dtype.append((col, 'f4'))  # Assuming 32-bit float
-        else:
-            raise ValueError(f"Unsupported dtype: {col_dtype}")
+
+        col_data = df[col]
+        col_dtype = col_data.dtype
+
+        try:
+            if pd.api.types.is_string_dtype(col_dtype):
+                # Convert string dtype to fixed-length strings
+                max_len = col_data.str.len().max() if not col_data.isnull().all() else 0
+                dtype.append((col, f'S{max_len}'))
+            elif pd.api.types.is_integer_dtype(col_dtype):
+                dtype.append((col, 'i4'))  # Assuming 32-bit integer
+            elif pd.api.types.is_float_dtype(col_dtype):
+                dtype.append((col, 'f4'))  # Assuming 32-bit float
+            else:
+                # Handle unsupported data types
+                print(f"Unsupported dtype found in column '{col}': {col_data.dtype}")
+                raise ValueError(f"Unsupported data type: {col_data.dtype}")
+            
+        except Exception as e:
+            # Log more detailed error message
+            print(f"Error processing column '{col}': {e}")
+            raise

    # Convert the DataFrame to a structured array
    structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)