From fd92bce802dc20b74eae3fd404125a4fa35f38e6 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe <juan.florez-ospina@psi.ch>
Date: Sat, 23 Nov 2024 16:28:49 +0100
Subject: [PATCH] Implemented sanitize dataframe function to deal with 'O'
 (object) columns, which may hold numbers or strings detected as string types.
 We then use it prior to converting a dataframe into a structured numpy array.

---
 utils/g5505_utils.py | 67 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 11 deletions(-)

diff --git a/utils/g5505_utils.py b/utils/g5505_utils.py
index 2343181..53c53fd 100644
--- a/utils/g5505_utils.py
+++ b/utils/g5505_utils.py
@@ -109,22 +109,67 @@ def created_at(datetime_format = '%Y-%m-%d %H:%M:%S'):
     created_at = now_tz_aware.strftime(datetime_format) #+ '_UTC-OFST_' + tz
     return created_at
 
+def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of `df` with datetime and object columns normalized.
+
+    Datetime columns become 'YYYY-MM-DD HH:MM:SS' strings (NaT -> ''). Object
+    columns whose non-missing values are all strings stay strings (missing ->
+    ''); every other object column goes through `pd.to_numeric`, where entries
+    that cannot be parsed become NaN and integer results are stored as float.
+    """
+    df = df.copy()  # sanitize a copy so the caller's frame is left untouched
+
+    # strftime yields NaN for NaT; fill it here, otherwise the object-column
+    # pass below sees a str/NaN mix and coerces the whole column to NaN.
+    for col in df.select_dtypes(include=['datetime']).columns:
+        df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S').fillna('')
+
+    # Handle object columns, which may hold strings, numbers, or a mix of both
+    for col in df.select_dtypes(include='O').columns:
+        col_data = df[col]
+        is_missing = col_data.isna()
+        only_strings = col_data[~is_missing].map(lambda x: isinstance(x, str)).all()
+
+        # A non-empty, all-missing column carries no text, so it stays numeric
+        if only_strings and (col_data.empty or not is_missing.all()):
+            df[col] = col_data.fillna('').astype(str)  # missing -> '', not 'nan'
+        else:
+            # Coerce to numeric (errors -> NaN); integer results are cast to
+            # float, as before, so the column dtype can represent NaN
+            numeric = pd.to_numeric(col_data, errors='coerce')
+            is_int = pd.api.types.is_integer_dtype(numeric)
+            df[col] = numeric.astype(float) if is_int else numeric
+
+    return df
+
 def convert_dataframe_to_np_structured_array(df: pd.DataFrame):
 
+    df = sanitize_dataframe(df)
      # Define the dtype for the structured array, ensuring compatibility with h5py
     dtype = []
     for col in df.columns:
-        col_dtype = df[col].dtype
-        if pd.api.types.is_string_dtype(col_dtype):
-            # Convert string dtype to fixed-length strings
-            max_len = df[col].str.len().max()
-            dtype.append((col, f'S{max_len}'))
-        elif pd.api.types.is_integer_dtype(col_dtype):
-            dtype.append((col, 'i4'))  # Assuming 32-bit integer
-        elif pd.api.types.is_float_dtype(col_dtype):
-            dtype.append((col, 'f4'))  # Assuming 32-bit float
-        else:
-            raise ValueError(f"Unsupported dtype: {col_dtype}")
+
+        col_data = df[col]
+        col_dtype = col_data.dtype
+
+        try:
+            if pd.api.types.is_string_dtype(col_dtype):
+                # Convert string dtype to fixed-length strings
+                max_len = col_data.str.len().max() if not col_data.isnull().all() else 0
+                dtype.append((col, f'S{max_len}'))
+            elif pd.api.types.is_integer_dtype(col_dtype):
+                dtype.append((col, 'i4'))  # Assuming 32-bit integer
+            elif pd.api.types.is_float_dtype(col_dtype):
+                dtype.append((col, 'f4'))  # Assuming 32-bit float
+            else:
+                # Handle unsupported data types
+                print(f"Unsupported dtype found in column '{col}': {col_data.dtype}")
+                raise ValueError(f"Unsupported data type: {col_data.dtype}")
+            
+        except Exception as e:
+            # Log more detailed error message
+            print(f"Error processing column '{col}': {e}")
+            raise
 
     # Convert the DataFrame to a structured array
     structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)