StreamWriter: Opus-provided improvements to improve resilience for errors
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m17s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m22s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 16m11s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 16m41s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m3s
Build Packages / build:rpm (rocky8) (push) Successful in 18m10s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 19m15s
Build Packages / build:rpm (rocky9) (push) Successful in 19m17s
Build Packages / Generate python client (push) Successful in 1m46s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 9m9s
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 2m0s
Build Packages / DIALS test (push) Failing after 9m1s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 10m43s
Build Packages / XDS test (JFJoch plugin) (push) Failing after 9m9s
Build Packages / XDS test (durin plugin) (push) Failing after 9m43s
Build Packages / XDS test (neggia plugin) (push) Failing after 8m41s
Build Packages / Unit tests (push) Failing after 56m55s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m17s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m22s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 16m11s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 16m41s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m3s
Build Packages / build:rpm (rocky8) (push) Successful in 18m10s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 19m15s
Build Packages / build:rpm (rocky9) (push) Successful in 19m17s
Build Packages / Generate python client (push) Successful in 1m46s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 9m9s
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 2m0s
Build Packages / DIALS test (push) Failing after 9m1s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 10m43s
Build Packages / XDS test (JFJoch plugin) (push) Failing after 9m9s
Build Packages / XDS test (durin plugin) (push) Failing after 9m43s
Build Packages / XDS test (neggia plugin) (push) Failing after 8m41s
Build Packages / Unit tests (push) Failing after 56m55s
This commit is contained in:
+47
-13
@@ -24,10 +24,22 @@ void StreamWriter::NotifyTcpAck(TCPFrameType ack_for, bool ok, bool fatal, TCPAc
|
||||
if (!image_puller.SupportsAck())
|
||||
return;
|
||||
|
||||
// Send only one fatal DATA ack per data collection; subsequent in-flight
|
||||
// DATA frames just get a quiet non-fatal NACK.
|
||||
bool send_fatal = fatal;
|
||||
if (ack_for == TCPFrameType::DATA) {
|
||||
if (fatal && tcp_data_fatal_sent) {
|
||||
send_fatal = false;
|
||||
ok = false;
|
||||
} else if (fatal) {
|
||||
tcp_data_fatal_sent = true;
|
||||
}
|
||||
}
|
||||
|
||||
PullerAckMessage ack;
|
||||
ack.ack_for = ack_for;
|
||||
ack.ok = ok;
|
||||
ack.fatal = fatal;
|
||||
ack.fatal = send_fatal;
|
||||
ack.error_code = code;
|
||||
ack.error_text = error_text;
|
||||
ack.run_number = run_number;
|
||||
@@ -105,7 +117,7 @@ void StreamWriter::ProcessCalibrationImage() {
|
||||
}
|
||||
|
||||
void StreamWriter::ProcessDataImage() {
|
||||
switch (state) {
|
||||
switch (state) {
|
||||
case StreamWriterState::Idle:
|
||||
logger.Warning("Missing meaningful image while waiting for START");
|
||||
mute_data_msg_in_idle = true;
|
||||
@@ -113,7 +125,7 @@ void StreamWriter::ProcessDataImage() {
|
||||
case StreamWriterState::Started:
|
||||
start_time = std::chrono::system_clock::now();
|
||||
state = StreamWriterState::Receiving;
|
||||
// Follow through to receiving - no brake!
|
||||
// Follow through to receiving - no break!
|
||||
case StreamWriterState::Receiving:
|
||||
try {
|
||||
if (verbose)
|
||||
@@ -129,29 +141,44 @@ void StreamWriter::ProcessDataImage() {
|
||||
if (verbose)
|
||||
logger.Info("Written");
|
||||
NotifyTcpAck(TCPFrameType::DATA, true, false, TCPAckCode::None);
|
||||
} catch (const JFJochException &e) {
|
||||
logger.ErrorException(e);
|
||||
logger.Warning("Error writing image - switching to error state");
|
||||
} catch (const std::exception &e) {
|
||||
logger.Error("Error writing image: {}", e.what());
|
||||
logger.Warning("Switching to error state; subsequent DATA frames will be NACKed silently");
|
||||
const bool first_fatal = !tcp_data_fatal_sent;
|
||||
state = StreamWriterState::Error;
|
||||
err = e.what();
|
||||
NotifyTcpAck(TCPFrameType::DATA, false, true, TCPAckCode::DataWriteFailed, err);
|
||||
// Heuristic mapping ENOSPC -> NoSpaceLeft
|
||||
TCPAckCode code = TCPAckCode::DataWriteFailed;
|
||||
std::string what = e.what();
|
||||
if (what.find("no space") != std::string::npos ||
|
||||
what.find("No space") != std::string::npos ||
|
||||
what.find("ENOSPC") != std::string::npos)
|
||||
code = TCPAckCode::NoSpaceLeft;
|
||||
NotifyTcpAck(TCPFrameType::DATA, false, first_fatal, code, err);
|
||||
}
|
||||
break;
|
||||
case StreamWriterState::Error:
|
||||
// Error state => Wait till end only
|
||||
// Quiet non-fatal NACK so the producer knows this image wasn't written
|
||||
NotifyTcpAck(TCPFrameType::DATA, false, false, TCPAckCode::DataWriteFailed, err);
|
||||
break;
|
||||
case StreamWriterState::Finalized:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void StreamWriter::ProcessEndMessage() {
|
||||
// Ignore end message when idle state!
|
||||
if (state == StreamWriterState::Idle || state == StreamWriterState::Finalized)
|
||||
return;
|
||||
|
||||
if (verbose)
|
||||
logger.Info("Received end message");
|
||||
|
||||
// Important: close DATA files BEFORE finalizing the master, so that the
|
||||
// master's links/VDS reference renamed final filenames (and so that we
|
||||
// don't keep writing to a master if data-file finalize is going to fail
|
||||
// anyway on ENOSPC).
|
||||
FinalizeDataCollection();
|
||||
|
||||
if (state != StreamWriterState::Error) {
|
||||
try {
|
||||
if ((image_puller_output.cbor->end_message->max_image_number == 0) && (max_image_number > 0))
|
||||
@@ -165,9 +192,12 @@ void StreamWriter::ProcessEndMessage() {
|
||||
}
|
||||
}
|
||||
|
||||
FinalizeDataCollection();
|
||||
// Drop file_writer either way; the master tmp is unlinked in NXmx::~NXmx
|
||||
// / NXmx::Finalize on the error path.
|
||||
file_writer.reset();
|
||||
|
||||
const bool error_state = (state == StreamWriterState::Error);
|
||||
state = StreamWriterState::Finalized;
|
||||
|
||||
NotifyReceiverOnFinalizedWrite(writer_notification_zmq_addr);
|
||||
NotifyTcpAck(TCPFrameType::END, !error_state, error_state,
|
||||
@@ -181,18 +211,22 @@ void StreamWriter::FinalizeDataCollection() {
|
||||
if (file_writer && (state != StreamWriterState::Error)) {
|
||||
try {
|
||||
hdf5_data_file_statistics = file_writer->Finalize();
|
||||
} catch (JFJochException &e) {
|
||||
} catch (const std::exception &e) {
|
||||
state = StreamWriterState::Error;
|
||||
err = e.what();
|
||||
logger.ErrorException(e);
|
||||
logger.Error("Error finalizing writing - switching to error state");
|
||||
}
|
||||
} else if (file_writer && state == StreamWriterState::Error) {
|
||||
// Best-effort cleanup: still call Finalize so per-file Close() runs and
|
||||
// unlinks the tmp files even when broken.
|
||||
try { (void) file_writer->Finalize(); } catch (...) {}
|
||||
hdf5_data_file_statistics.clear();
|
||||
} else {
|
||||
hdf5_data_file_statistics.clear();
|
||||
}
|
||||
file_writer.reset();
|
||||
logger.Info("Data writing finished");
|
||||
state = StreamWriterState::Finalized;
|
||||
// Don't reset file_writer here: ProcessEndMessage still needs it for the master.
|
||||
}
|
||||
|
||||
void StreamWriter::CollectImages() {
|
||||
|
||||
Reference in New Issue
Block a user