Add job cancellation handling and periodic cleanup logic

Introduce two new job statuses, "to_cancel" and "cancelled", to improve job state tracking. Set `slurm_id` to null when a job is marked as cancelled, and add a background thread that deletes jobs which have been in the "cancelled" state for more than 2 hours. The cleanup runs once an hour to maintain database hygiene.
This commit is contained in:
GotthardG 2025-05-01 15:17:42 +02:00
parent 9e875c5a04
commit a1b857b78a
3 changed files with 46 additions and 0 deletions

View File

@ -313,6 +313,8 @@ class JobStatus(str, enum.Enum):
TODO = "todo"
SUBMITTED = "submitted"
DONE = "done"
TO_CANCEL = "to_cancel"
CANCELLED = "cancelled"
FAILED = "failed"

View File

@ -80,6 +80,10 @@ def update_jobs_status(payload: JobsUpdate, db: Session = Depends(get_db)):
raise HTTPException(status_code=404, detail="Job not found")
# If status is being updated to "cancelled"
if payload.status == "cancelled":
job.slurm_id = None
# Update the status
job.status = payload.status
job.slurm_id = payload.slurm_id
@ -93,3 +97,23 @@ def update_jobs_status(payload: JobsUpdate, db: Session = Depends(get_db)):
# Return the updated job's info as response
return JobsUpdate(job_id=job.id, status=job.status, slurm_id=job.slurm_id)
def cleanup_cancelled_jobs(db: Session, max_age_hours: float = 2) -> int:
    """Delete jobs that have been in the 'cancelled' state for too long.

    A job is removed when its status is ``"cancelled"`` and its
    ``updated_at`` timestamp is older than ``max_age_hours`` before now.

    Args:
        db: Database session used for the query, the deletes and the commit.
        max_age_hours: Minimum age, in hours, a cancelled job must have
            before it is deleted. Defaults to 2.

    Returns:
        The number of jobs that were deleted.

    Raises:
        Exception: Whatever the session raises while deleting or
            committing; the transaction is rolled back before re-raising.
    """
    from datetime import datetime
    from datetime import timedelta

    # Read the clock once so the logged time and the cutoff agree.
    # NOTE(review): uses naive local time; assumes JobModel.updated_at is
    # stored the same way -- confirm against the model definition.
    now = datetime.now()
    cutoff = now - timedelta(hours=max_age_hours)
    print(
        f"Cleaning up cancelled jobs older than {cutoff} "
        f"(current time: {now})"
    )

    old_jobs = (
        db.query(JobModel)
        .filter(JobModel.status == "cancelled", JobModel.updated_at < cutoff)
        .all()
    )

    try:
        # Delete through the session object by object (rather than a bulk
        # DELETE) so any ORM-level cascades configured on the model still run.
        for job in old_jobs:
            db.delete(job)
        db.commit()
    except Exception:
        # Leave the session usable for the caller instead of stuck in a
        # failed transaction.
        db.rollback()
        raise

    return len(old_jobs)

View File

@ -122,6 +122,20 @@ if environment == "dev":
ssl_heidi.generate_self_signed_cert(cert_path, key_path)
def cleanup_job_loop(interval_seconds: float = 3600):
    """Run the cancelled-job cleanup repeatedly, sleeping between passes.

    This function never returns; it is meant to be used as the target of a
    background (daemon) thread. Each pass obtains a fresh database session
    from ``get_db``, runs ``cleanup_cancelled_jobs`` on it and closes it.

    Args:
        interval_seconds: Pause between two cleanup passes, in seconds.
            Defaults to 3600 (one hour).
    """
    import time

    from app.dependencies import get_db
    from app.routers.processing import cleanup_cancelled_jobs

    while True:
        db = None
        try:
            db = next(get_db())
            cleanup_cancelled_jobs(db)
        except Exception as exc:
            # A failed pass (e.g. a temporary database outage) must not end
            # the thread, otherwise cleanup would silently stop until the
            # application is restarted. Report it and retry next interval.
            print(f"[ERROR] Cancelled-job cleanup failed: {exc!r}")
        finally:
            if db is not None:
                db.close()
        time.sleep(interval_seconds)
@asynccontextmanager
async def lifespan(app: FastAPI):
print("[INFO] Running application startup tasks...")
@ -174,6 +188,12 @@ async def lifespan(app: FastAPI):
load_slots_data(db)
from threading import Thread
# Start cleanup in background thread
thread = Thread(target=cleanup_job_loop, daemon=True)
thread.start()
yield
finally:
db.close()