Refactor spreadsheet handling to track corrections and defaults
Improved the backend's value cleaning to differentiate between corrections and defaults, logging metadata for clearer traceability. Updated frontend to display corrected/defaulted fields with visual cues and tooltips for better user feedback. Enhanced data models and response structures to support this richer metadata.
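
The heart of the backend change is a new return contract: `_clean_value` now returns a `(cleaned_value, default_applied)` tuple instead of a bare value, and the row loop classifies a column as "corrected" when the cleaned value differs from the original, and as "defaulted" when the default flag is set. A minimal standalone sketch of that contract (a toy function for illustration, not the production importer):

    # Toy model of the new contract; mirrors the diff below, not the production code.
    def clean(value, column_name=None):
        default_applied = False
        if value is None or (isinstance(value, str) and value.strip() == ""):
            if column_name == "directory":
                default_applied = True  # empty directory gets a default value
                return "{sgPuck}/{sgPosition}", default_applied
            return None, default_applied
        return str(value).strip().replace(" ", "_"), default_applied

    cleaned, defaulted = clean("my puck")        # -> ("my_puck", False)
    corrected = cleaned != "my puck"             # True: corrected, not defaulted
    cleaned, defaulted = clean("", "directory")  # -> ("{sgPuck}/{sgPosition}", True)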
@@ -65,37 +65,33 @@ class SampleSpreadsheetImporter:
     def _clean_value(self, value, expected_type=None, column_name=None):
         """
         Cleans and validates the given value based on its expected type.
         Different behavior is applied to specific columns if needed.
+        Tracks corrections and defaults applied separately.
         """
-        # If the value is None or empty string
+        default_applied = False
+
+        # Handle empty or None values
         if value is None or (isinstance(value, str) and value.strip() == ""):
             if column_name == "directory":
                 logger.warning("Directory value is empty. Assigning default value.")
-                self.default_set = True  # Flag to indicate a default value is set.
-                return "{sgPuck}/{sgPosition}"  # Default directory
-            self.default_set = False
-            return None
+                default_applied = True
+                return "{sgPuck}/{sgPosition}", default_applied
+
+            return None, default_applied
 
-        # Convert to string and strip whitespaces
+        # Clean up the value
         cleaned_value = str(value).strip()
 
-        # Handle specific column behaviors
+        # Handle `type` casting logic
         if expected_type == str:
-            if column_name is None:
-                logger.warning(f"Missing column_name for value: {value}")
-            elif column_name == "comments":
-                return " ".join(cleaned_value.split())  # Normalize excessive spaces
-            else:
-                # Replace spaces with underscores for general string columns
-                return cleaned_value.replace(" ", "_")
+            if column_name == "comments":
+                return " ".join(cleaned_value.split()), default_applied
+            if " " in cleaned_value:
+                cleaned_value = cleaned_value.replace(" ", "_")
 
         elif expected_type in [int, float]:
             try:
                 # Remove any invalid characters and cast to the expected type
                 cleaned_value = re.sub(r"[^\d.]", "", cleaned_value)
-                return expected_type(cleaned_value)
+                cleaned_value = expected_type(cleaned_value)
             except (ValueError, TypeError) as e:
                 logger.error(
                     f"Failed to cast value '{value}' to {expected_type}. Error: {e}"
@@ -104,8 +100,15 @@ class SampleSpreadsheetImporter:
                     f"Invalid value: '{value}'. Expected type: {expected_type}."
                 )
 
-        # Return cleaned value for other types
-        return cleaned_value
+        # Avoid marking `None -> None` as a correction
+        if cleaned_value == value:
+            default_applied = (
+                False  # Ensure default_applied stays False for unchanged `value`.
+            )
+
+        if not isinstance(cleaned_value, (str, int, float)):
+            raise TypeError(f"Unexpected type for cleaned value: {type(cleaned_value)}")
+        return cleaned_value, default_applied
 
     def import_spreadsheet(self, file):
         return self.import_spreadsheet_with_errors(file)
@@ -200,30 +203,49 @@ class SampleSpreadsheetImporter:
                 logger.debug(f"Skipping empty row at index {index}")
                 continue
 
-            # Record raw data for later use
-            raw_data.append({"row_num": index + 4, "data": list(row)})
-
             # Ensure row has the expected number of columns
             if len(row) < expected_columns:
                 row = list(row) + [None] * (expected_columns - len(row))
 
-            # Prepare the record dynamically based on headers
+            # Reset flags for the current row
+            self.default_set = False
+            corrected = False
+            defaulted_columns = []
+            corrected_columns = []
             record = {}
 
             for col_idx, column_name in enumerate(headers):
                 original_value = row[col_idx] if col_idx < len(row) else None
                 expected_type = self.get_expected_type(column_name)
 
-                # Call _clean_value dynamically with the correct column_name
                 try:
-                    cleaned_value = self._clean_value(
+                    # Call `_clean_value` to clean the value and extract
+                    # cleaning-related indicators
+                    cleaned_value, default_applied = self._clean_value(
                         original_value, expected_type, column_name
                     )
+
+                    # Check if the cleaned value is meaningfully different from the
+                    # original value
+                    is_corrected = cleaned_value != original_value
+
+                    # Append column to corrected columns only if the value was corrected
+                    if is_corrected:
+                        corrected = True
+                        corrected_columns.append(column_name)
+
+                    # Track default columns separately if a default was applied
+                    if default_applied:
+                        corrected = True
+                        defaulted_columns.append(column_name)
+
+                    # Update the record with cleaned value (store only the cleaned part,
+                    # not the tuple)
                     record[column_name] = cleaned_value
                 except (ValueError, TypeError) as e:
                     logger.error(
-                        f"Validation error for row {index + 4},"
-                        f" column '{column_name}': "
-                        f"{str(e)}"
+                        f"Validation error for row {index + 4}"
+                        f", column '{column_name}': {str(e)}"
                    )
                    errors.append(
                        {
@@ -234,63 +256,71 @@ class SampleSpreadsheetImporter:
                         }
                     )
 
+            # Build metadata for the row
+            raw_data.append(
+                {
+                    "row_num": index + 4,
+                    "data": list(row),  # Original data
+                    "default_set": bool(
+                        defaulted_columns
+                    ),  # True if any defaults were applied
+                    "corrected": corrected,  # True if any value was corrected
+                    # List of corrected columns (if any)
+                    "corrected_columns": corrected_columns,
+                    # List of defaulted columns (if any)
+                    "defaulted_columns": defaulted_columns,
+                }
+            )
+
             # Nested processing for data_collection_parameters
             record["data_collection_parameters"] = {
-                "directory": record.get("directory"),
-                "oscillation": record.get("oscillation"),
-                "aperture": record.get("aperture"),
-                "exposure": record.get("exposure"),
-                "totalrange": record.get("totalrange"),
-                "transmission": record.get("transmission"),
-                "dose": record.get("dose"),
-                "targetresolution": record.get("targetresolution"),
-                "datacollectiontype": record.get("datacollectiontype"),
-                "processingpipeline": record.get("processingpipeline"),
-                "spacegroupnumber": record.get("spacegroupnumber"),
-                "cellparameters": record.get("cellparameters"),
-                "rescutkey": record.get("rescutkey"),
-                "rescutvalue": record.get("rescutvalue"),
-                "userresolution": record.get("userresolution"),
-                "pdbid": record.get("pdbid"),
-                "autoprocfull": record.get("autoprocfull"),
-                "procfull": record.get("procfull"),
-                "adpenabled": record.get("adpenabled"),
-                "noano": record.get("noano"),
-                "ffcscampaign": record.get("ffcscampaign"),
-                "trustedhigh": record.get("trustedhigh"),
-                "autoprocextraparams": record.get("autoprocextraparams"),
-                "chiphiangles": record.get("chiphiangles"),
+                "directory": record.get("directory", ""),
+                "oscillation": record.get("oscillation", 0.0),
+                "aperture": record.get("aperture", None),
+                "exposure": record.get("exposure", 0.0),
+                "totalrange": record.get("totalrange", 0),
+                "transmission": record.get("transmission", 0),
+                "dose": record.get("dose", None),
+                "targetresolution": record.get("targetresolution", 0.0),
+                "datacollectiontype": record.get("datacollectiontype", None),
+                "processingpipeline": record.get("processingpipeline", None),
+                "spacegroupnumber": record.get("spacegroupnumber", None),
+                "cellparameters": record.get("cellparameters", None),
+                "rescutkey": record.get("rescutkey", None),
+                "rescutvalue": record.get("rescutvalue", 0.0),
+                "userresolution": record.get("userresolution", 0.0),
+                "pdbid": record.get("pdbid", ""),
+                "autoprocfull": record.get("autoprocfull", False),
+                "procfull": record.get("procfull", False),
+                "adpenabled": record.get("adpenabled", False),
+                "noano": record.get("noano", False),
+                "ffcscampaign": record.get("ffcscampaign", False),
+                "trustedhigh": record.get("trustedhigh", 0.0),
+                "autoprocextraparams": record.get("autoprocextraparams", None),
+                "chiphiangles": record.get("chiphiangles", 0.0),
             }
 
             try:
                 # Validate the record
                 validated_record = SpreadsheetModel(**record)
 
                 # Add validated record to the model
                 model.append(validated_record)
             except ValidationError as e:
                 logger.error(f"Validation error in row {index + 4}: {e}")
                 for error in e.errors():
                     field_path = error["loc"]
                     msg = error["msg"]
 
-                    column_name = headers[field_path[0]]
+                    if field_path[0] == "data_collection_parameters":
+                        subfield = field_path[1]
+                        column_index = headers.index(subfield)
+                    else:
+                        field = field_path[0]
+                        column_index = headers.index(field)
 
                     error_info = {
                         "row": index + 4,
-                        "column": column_name,
-                        "value": row[col_idx],
+                        "cell": column_index,
+                        "value": row[column_index],
                         "message": msg,
                     }
                     errors.append(error_info)
 
         self.model = model
         logger.info(
             f"Finished processing {len(model)} records with {len(errors)} errors"
         )
 
         return self.model, errors, raw_data, headers  # Include headers in the response
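
For downstream consumers, the richer return value can be unpacked and the per-row metadata used to drive the frontend cues described above. A minimal sketch, assuming an `importer` instance and an open `file` (both hypothetical names):

    # Hypothetical usage; the tuple shape and metadata keys follow the diff above.
    model, errors, raw_data, headers = importer.import_spreadsheet_with_errors(file)
    for row_meta in raw_data:
        for col in row_meta["corrected_columns"]:
            # e.g. highlight the cell and show a "value was corrected" tooltip
            print(f"row {row_meta['row_num']}: corrected '{col}'")
        for col in row_meta["defaulted_columns"]:
            # e.g. show a "default applied" badge on the cell
            print(f"row {row_meta['row_num']}: defaulted '{col}'")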