Refactor spreadsheet processing to improve validation logic

Enhanced value cleaning and validation for spreadsheet data, with cleaning rules now resolved dynamically per column header. Improved user feedback through detailed error messages and visual indicators for corrected or defaulted values. Simplified backend and frontend logic for better maintainability and usability.
GotthardG
2025-01-13 21:55:15 +01:00
parent f855930340
commit f6c19cc4da
5 changed files with 221 additions and 110 deletions
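
For quick reference, a minimal sketch of the new string handling and the directory default, assuming behavior matches the diff below (clean_str and the sample inputs are hypothetical, not part of the commit):

    def clean_str(value, column_name):
        # Empty cells: only "directory" falls back to a placeholder default
        if value is None or (isinstance(value, str) and not value.strip()):
            return "{sgPuck}/{sgPosition}" if column_name == "directory" else None
        cleaned = str(value).strip()
        if column_name == "comments":
            return " ".join(cleaned.split())  # collapse repeated whitespace
        return cleaned.replace(" ", "_")  # underscores for other string columns

    assert clean_str("  my puck ", "puckname") == "my_puck"
    assert clean_str("two   spaces", "comments") == "two spaces"
    assert clean_str("", "directory") == "{sgPuck}/{sgPosition}"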


@@ -1,5 +1,6 @@
 import logging
 import openpyxl
+import re
 from pydantic import ValidationError
 from typing import List, Tuple
 from io import BytesIO
@@ -61,14 +62,40 @@ class SampleSpreadsheetImporter:
         # Return type if column exists, else default to str
         return column_type_mapping.get(column_name, str)

-    def _clean_value(self, value, expected_type=None):
-        if value is None:
+    def _clean_value(self, value, expected_type=None, column_name=None):
+        """
+        Cleans and validates the given value based on its expected type.
+        Different behavior is applied to specific columns if needed.
+        """
+        if value is None or (isinstance(value, str) and value.strip() == ""):
+            # Handle empty or None values
+            if column_name == "directory":
+                logger.warning("Directory value is empty. Assigning default value.")
+                self.default_set = True  # Flag to indicate a default value is set.
+                return "{sgPuck}/{sgPosition}"  # Default directory
+            self.default_set = False
             return None

-        if expected_type == str:
-            return str(value).strip()
-        if expected_type in [float, int]:
+        # Convert to string and strip whitespaces
+        cleaned_value = str(value).strip()
+
+        # Handle specific column behaviors
+        if expected_type == str:
+            if column_name is None:
+                logger.warning(f"Missing column_name for value: {value}")
+            elif column_name == "comments":
+                return " ".join(cleaned_value.split())  # Normalize excessive spaces
+            else:
+                # Replace spaces with underscores for general string columns
+                return cleaned_value.replace(" ", "_")
+        elif expected_type in [int, float]:
             try:
-                return expected_type(value)
+                # Remove any invalid characters and cast to the expected type
+                cleaned_value = re.sub(r"[^\d.]", "", cleaned_value)
+                return expected_type(cleaned_value)
             except (ValueError, TypeError) as e:
                 logger.error(
                     f"Failed to cast value '{value}' to {expected_type}. Error: {e}"
@@ -76,9 +103,9 @@ class SampleSpreadsheetImporter:
                 raise ValueError(
                     f"Invalid value: '{value}'. Expected type: {expected_type}."
                 )
-        # Fallback for unhandled types
-        logger.warning(f"Unhandled type for value: '{value}'. Returning as-is.")
-        return value
+        # Return cleaned value for other types
+        return cleaned_value

     def import_spreadsheet(self, file):
         return self.import_spreadsheet_with_errors(file)
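
A small standalone check of the numeric-casting rule in the hunk above, assuming it behaves as shown (cast_numeric is a hypothetical stand-in for the int/float branch of _clean_value). Stripping non-numeric characters lets unit suffixes through cleanly; a value like "1.2.3" would still fail the cast and surface as a row error:

    import re

    def cast_numeric(value, expected_type):
        # Keep only digits and dots before casting
        cleaned = re.sub(r"[^\d.]", "", str(value).strip())
        return expected_type(cleaned)

    assert cast_numeric("0.5 deg", float) == 0.5
    assert cast_numeric(" 100% ", int) == 100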
@@ -180,67 +207,67 @@ class SampleSpreadsheetImporter:
                 if len(row) < expected_columns:
                     row = list(row) + [None] * (expected_columns - len(row))

-                # Prepare the record with cleaned values
-                record = {
-                    "dewarname": self._clean_value(row[0], str),
-                    "puckname": self._clean_value(row[1], str),
-                    "pucktype": self._clean_value(row[2], str),
-                    "crystalname": self._clean_value(row[3], str),
-                    "positioninpuck": self._clean_value(row[4], int),
-                    "priority": self._clean_value(row[5], int),
-                    "comments": self._clean_value(row[6], str),
-                    "proteinname": self._clean_value(row[8], str),
-                }
+                # Prepare the record dynamically based on headers
+                record = {}
+                for col_idx, column_name in enumerate(headers):
+                    original_value = row[col_idx] if col_idx < len(row) else None
+                    expected_type = self.get_expected_type(column_name)
+
+                    # Call _clean_value dynamically with the correct column_name
+                    try:
+                        cleaned_value = self._clean_value(
+                            original_value, expected_type, column_name
+                        )
+                        record[column_name] = cleaned_value
+                    except (ValueError, TypeError) as e:
+                        logger.error(
+                            f"Validation error for row {index + 4},"
+                            f" column '{column_name}': "
+                            f"{str(e)}"
+                        )
+                        errors.append(
+                            {
+                                "row": index + 4,
+                                "column": column_name,
+                                "value": original_value,
+                                "message": str(e),
+                            }
+                        )
+
+                # Nested processing for data_collection_parameters
                 record["data_collection_parameters"] = {
-                    "directory": self._clean_value(row[7], str),
-                    "oscillation": self._clean_value(row[9], float),
-                    "aperture": self._clean_value(row[10], str),
-                    "exposure": self._clean_value(row[11], float),
-                    "totalrange": self._clean_value(row[12], float),
-                    "transmission": self._clean_value(row[13], int),
-                    "dose": self._clean_value(row[14], float),
-                    "targetresolution": self._clean_value(row[15], float),
-                    "datacollectiontype": self._clean_value(row[16], str),
-                    "processingpipeline": self._clean_value(row[17], str),
-                    "spacegroupnumber": self._clean_value(row[18], int),
-                    "cellparameters": self._clean_value(row[19], str),
-                    "rescutkey": self._clean_value(row[20], str),
-                    "rescutvalue": self._clean_value(row[21], str),
-                    "userresolution": self._clean_value(row[22], str),
-                    "pdbid": self._clean_value(row[23], str),
-                    "autoprocfull": self._clean_value(row[24], str),
-                    "procfull": self._clean_value(row[25], str),
-                    "adpenabled": self._clean_value(row[26], str),
-                    "noano": self._clean_value(row[27], str),
-                    "ffcscampaign": self._clean_value(row[28], str),
-                    "trustedhigh": self._clean_value(row[29], str),
-                    "autoprocextraparams": self._clean_value(row[30], str),
-                    "chiphiangles": self._clean_value(row[31], str),
+                    "directory": record.get("directory"),
+                    "oscillation": record.get("oscillation"),
+                    "aperture": record.get("aperture"),
+                    "exposure": record.get("exposure"),
+                    "totalrange": record.get("totalrange"),
+                    "transmission": record.get("transmission"),
+                    "dose": record.get("dose"),
+                    "targetresolution": record.get("targetresolution"),
+                    "datacollectiontype": record.get("datacollectiontype"),
+                    "processingpipeline": record.get("processingpipeline"),
+                    "spacegroupnumber": record.get("spacegroupnumber"),
+                    "cellparameters": record.get("cellparameters"),
+                    "rescutkey": record.get("rescutkey"),
+                    "rescutvalue": record.get("rescutvalue"),
+                    "userresolution": record.get("userresolution"),
+                    "pdbid": record.get("pdbid"),
+                    "autoprocfull": record.get("autoprocfull"),
+                    "procfull": record.get("procfull"),
+                    "adpenabled": record.get("adpenabled"),
+                    "noano": record.get("noano"),
+                    "ffcscampaign": record.get("ffcscampaign"),
+                    "trustedhigh": record.get("trustedhigh"),
+                    "autoprocextraparams": record.get("autoprocextraparams"),
+                    "chiphiangles": record.get("chiphiangles"),
                 }

                try:
+                    # Validate the record
                    validated_record = SpreadsheetModel(**record)
+
+                    # Get the corrected `directory`
+                    corrected_directory = (
+                        validated_record.data_collection_parameters.directory
+                    )
+
+                    # Update `raw_data` to reflect the corrected value
+                    raw_data[-1]["data"][
+                        7
+                    ] = corrected_directory  # Replace directory in raw data
+                    raw_data[-1][
+                        "directory"
+                    ] = corrected_directory  # Add a top-level "directory" key
+                    raw_data[-1]["default_set"] = (
+                        corrected_directory == "{sgPuck}/{sgPosition}"
+                    )
+
+                    # Add validated record to the model
                    model.append(validated_record)
                except ValidationError as e:
                    logger.error(f"Validation error in row {index + 4}: {e}")
                    for error in e.errors():
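
A rough sketch of the header-driven record building this hunk introduces, with a hypothetical subset of headers and a simplified stand-in for _clean_value and get_expected_type (none of these sample values are from the commit):

    headers = ["dewarname", "puckname", "positioninpuck"]  # hypothetical subset
    types = {"dewarname": str, "puckname": str, "positioninpuck": int}  # stand-in for get_expected_type()
    row = ("Dewar One", "PK 01", "3")

    record = {}
    for col_idx, column_name in enumerate(headers):
        value = row[col_idx] if col_idx < len(row) else None
        expected_type = types[column_name]
        cleaned = str(value).strip()
        # Simplified cleaning: underscores for strings, cast for numerics
        record[column_name] = (
            cleaned.replace(" ", "_") if expected_type is str else expected_type(cleaned)
        )

    assert record == {"dewarname": "Dewar_One", "puckname": "PK_01", "positioninpuck": 3}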