StreamWriter: Opus-provided improvements to improve resilience for errors
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m17s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m22s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 16m11s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 16m41s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m3s
Build Packages / build:rpm (rocky8) (push) Successful in 18m10s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 19m15s
Build Packages / build:rpm (rocky9) (push) Successful in 19m17s
Build Packages / Generate python client (push) Successful in 1m46s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 9m9s
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 2m0s
Build Packages / DIALS test (push) Failing after 9m1s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 10m43s
Build Packages / XDS test (JFJoch plugin) (push) Failing after 9m9s
Build Packages / XDS test (durin plugin) (push) Failing after 9m43s
Build Packages / XDS test (neggia plugin) (push) Failing after 8m41s
Build Packages / Unit tests (push) Failing after 56m55s

This commit is contained in:
2026-05-04 11:44:46 +02:00
parent 0a2c370983
commit f7a9e4eab1
7 changed files with 277 additions and 109 deletions
+47 -13
View File
@@ -24,10 +24,22 @@ void StreamWriter::NotifyTcpAck(TCPFrameType ack_for, bool ok, bool fatal, TCPAc
if (!image_puller.SupportsAck())
return;
// Send only one fatal DATA ack per data collection; subsequent in-flight
// DATA frames just get a quiet non-fatal NACK.
bool send_fatal = fatal;
if (ack_for == TCPFrameType::DATA) {
if (fatal && tcp_data_fatal_sent) {
send_fatal = false;
ok = false;
} else if (fatal) {
tcp_data_fatal_sent = true;
}
}
PullerAckMessage ack;
ack.ack_for = ack_for;
ack.ok = ok;
ack.fatal = fatal;
ack.fatal = send_fatal;
ack.error_code = code;
ack.error_text = error_text;
ack.run_number = run_number;
@@ -105,7 +117,7 @@ void StreamWriter::ProcessCalibrationImage() {
}
void StreamWriter::ProcessDataImage() {
switch (state) {
switch (state) {
case StreamWriterState::Idle:
logger.Warning("Missing meaningful image while waiting for START");
mute_data_msg_in_idle = true;
@@ -113,7 +125,7 @@ void StreamWriter::ProcessDataImage() {
case StreamWriterState::Started:
start_time = std::chrono::system_clock::now();
state = StreamWriterState::Receiving;
// Follow through to receiving - no brake!
// Follow through to receiving - no break!
case StreamWriterState::Receiving:
try {
if (verbose)
@@ -129,29 +141,44 @@ void StreamWriter::ProcessDataImage() {
if (verbose)
logger.Info("Written");
NotifyTcpAck(TCPFrameType::DATA, true, false, TCPAckCode::None);
} catch (const JFJochException &e) {
logger.ErrorException(e);
logger.Warning("Error writing image - switching to error state");
} catch (const std::exception &e) {
logger.Error("Error writing image: {}", e.what());
logger.Warning("Switching to error state; subsequent DATA frames will be NACKed silently");
const bool first_fatal = !tcp_data_fatal_sent;
state = StreamWriterState::Error;
err = e.what();
NotifyTcpAck(TCPFrameType::DATA, false, true, TCPAckCode::DataWriteFailed, err);
// Heuristic mapping ENOSPC -> NoSpaceLeft
TCPAckCode code = TCPAckCode::DataWriteFailed;
std::string what = e.what();
if (what.find("no space") != std::string::npos ||
what.find("No space") != std::string::npos ||
what.find("ENOSPC") != std::string::npos)
code = TCPAckCode::NoSpaceLeft;
NotifyTcpAck(TCPFrameType::DATA, false, first_fatal, code, err);
}
break;
case StreamWriterState::Error:
// Error state => Wait till end only
// Quiet non-fatal NACK so the producer knows this image wasn't written
NotifyTcpAck(TCPFrameType::DATA, false, false, TCPAckCode::DataWriteFailed, err);
break;
case StreamWriterState::Finalized:
break;
}
}
void StreamWriter::ProcessEndMessage() {
// Ignore end message when idle state!
if (state == StreamWriterState::Idle || state == StreamWriterState::Finalized)
return;
if (verbose)
logger.Info("Received end message");
// Important: close DATA files BEFORE finalizing the master, so that the
// master's links/VDS reference renamed final filenames (and so that we
// don't keep writing to a master if data-file finalize is going to fail
// anyway on ENOSPC).
FinalizeDataCollection();
if (state != StreamWriterState::Error) {
try {
if ((image_puller_output.cbor->end_message->max_image_number == 0) && (max_image_number > 0))
@@ -165,9 +192,12 @@ void StreamWriter::ProcessEndMessage() {
}
}
FinalizeDataCollection();
// Drop file_writer either way; the master tmp is unlinked in NXmx::~NXmx
// / NXmx::Finalize on the error path.
file_writer.reset();
const bool error_state = (state == StreamWriterState::Error);
state = StreamWriterState::Finalized;
NotifyReceiverOnFinalizedWrite(writer_notification_zmq_addr);
NotifyTcpAck(TCPFrameType::END, !error_state, error_state,
@@ -181,18 +211,22 @@ void StreamWriter::FinalizeDataCollection() {
if (file_writer && (state != StreamWriterState::Error)) {
try {
hdf5_data_file_statistics = file_writer->Finalize();
} catch (JFJochException &e) {
} catch (const std::exception &e) {
state = StreamWriterState::Error;
err = e.what();
logger.ErrorException(e);
logger.Error("Error finalizing writing - switching to error state");
}
} else if (file_writer && state == StreamWriterState::Error) {
// Best-effort cleanup: still call Finalize so per-file Close() runs and
// unlinks the tmp files even when broken.
try { (void) file_writer->Finalize(); } catch (...) {}
hdf5_data_file_statistics.clear();
} else {
hdf5_data_file_statistics.clear();
}
file_writer.reset();
logger.Info("Data writing finished");
state = StreamWriterState::Finalized;
// Don't reset file_writer here: ProcessEndMessage still needs it for the master.
}
void StreamWriter::CollectImages() {