TCPStreamPusher: post-zerocopy cleanup + fix queue-path backpressure drop
Follow-up simplifications after removing the zerocopy machinery, plus a real backpressure bug the cleanup surfaced: - SendImage(ZeroCopyReturnValue&) imposed a hard 2s deadline on enqueueing and then marked the connection broken. At high frame rate the 128-deep queue fills in tens of ms, so any filesystem stall longer than ~2s dropped the run even though the writer was alive and heartbeating -- defeating the whole BUSY-heartbeat backpressure design. Block instead while the peer is alive (!broken && active); the real liveness decision already lives in SendAll's peer-liveness timeout, which the writer's BUSY heartbeats keep fresh. This makes the queue path consistent with the send path: both wait out arbitrarily long stalls and only give up when the peer goes genuinely silent. - Drop the dead per-connection data_sent counter (written, never read) and the redundant ImagePusherQueueElement.image_data set on the TCP path (only the HDF5 pusher reads that field). - Add SetPeerLivenessTimeout() so the liveness window is tunable (and testable). Add TCPImageCommTest_StalledWriter_SurvivesViaHeartbeat: a controllable raw writer double connects, ACKs START, then stops draining for 4s while still sending BUSY heartbeats (peer-liveness window set to 2s). The run must ride out the stall on the zero-copy queue path and deliver all 1000 images. Verified to fail (115/1000 delivered, connection dropped) against the old 2s-deadline behavior. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,11 +3,25 @@
|
||||
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <cstring>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <unistd.h>
|
||||
#include <catch2/catch_all.hpp>
|
||||
|
||||
#include "../image_pusher/TCPStreamPusher.h"
|
||||
#include "../image_puller/TCPImagePuller.h"
|
||||
#include "../image_puller/ZMQImagePuller.h"
|
||||
#include "../common/ImageBuffer.h"
|
||||
#include "../common/ZeroCopyReturnValue.h"
|
||||
|
||||
TEST_CASE("TCPImageCommTest_2Writers_WithAck", "[TCP]") {
|
||||
const size_t nframes = 128;
|
||||
@@ -698,3 +712,221 @@ TEST_CASE("TCPImageCommTest_RepubToZMQ", "[TCP][ZeroMQ]") {
|
||||
REQUIRE(zmq_nimages == nframes);
|
||||
REQUIRE(zmq_errors == 0);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Controllable TCP "writer" peer for backpressure tests. Connects to the pusher, ACKs
|
||||
// START, then *stalls* (stops draining the socket) until Release() is called, while a
|
||||
// background thread keeps sending BUSY heartbeats — i.e. a writer that is alive but
|
||||
// wedged (e.g. on a slow filesystem at high frame rate). Catch2 assertion macros are not
|
||||
// thread-safe, so the worker threads only touch atomics; the test thread asserts.
|
||||
class StallableWriterDouble {
|
||||
public:
|
||||
StallableWriterDouble(const std::string &tcp_addr, int rcvbuf_bytes) {
|
||||
auto [host, port] = ParseHostPort(tcp_addr);
|
||||
fd_ = ::socket(AF_INET, SOCK_STREAM, 0);
|
||||
if (fd_ < 0)
|
||||
return;
|
||||
setsockopt(fd_, SOL_SOCKET, SO_RCVBUF, &rcvbuf_bytes, sizeof(rcvbuf_bytes));
|
||||
sockaddr_in sin{};
|
||||
sin.sin_family = AF_INET;
|
||||
sin.sin_port = htons(port);
|
||||
inet_pton(AF_INET, host.c_str(), &sin.sin_addr);
|
||||
if (::connect(fd_, reinterpret_cast<sockaddr *>(&sin), sizeof(sin)) != 0) {
|
||||
::close(fd_);
|
||||
fd_ = -1;
|
||||
return;
|
||||
}
|
||||
busy_thread_ = std::thread([this] { BusyLoop(); });
|
||||
reader_thread_ = std::thread([this] { ReaderLoop(); });
|
||||
}
|
||||
|
||||
~StallableWriterDouble() {
|
||||
stop_ = true;
|
||||
Release();
|
||||
if (fd_ >= 0)
|
||||
::shutdown(fd_, SHUT_RDWR);
|
||||
if (reader_thread_.joinable())
|
||||
reader_thread_.join();
|
||||
if (busy_thread_.joinable())
|
||||
busy_thread_.join();
|
||||
if (fd_ >= 0)
|
||||
::close(fd_);
|
||||
}
|
||||
|
||||
[[nodiscard]] bool Connected() const { return fd_ >= 0; }
|
||||
|
||||
// Stop stalling: let the reader drain DATA and ACK END.
|
||||
void Release() {
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(mtx_);
|
||||
released_ = true;
|
||||
}
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
[[nodiscard]] size_t DataFramesReceived() const { return data_frames_.load(); }
|
||||
[[nodiscard]] bool EndAcked() const { return end_acked_.load(); }
|
||||
|
||||
private:
|
||||
static std::pair<std::string, uint16_t> ParseHostPort(const std::string &addr) {
|
||||
const std::string prefix = "tcp://";
|
||||
const auto hp = addr.substr(prefix.size());
|
||||
const auto p = hp.find_last_of(':');
|
||||
return {hp.substr(0, p), static_cast<uint16_t>(std::stoi(hp.substr(p + 1)))};
|
||||
}
|
||||
|
||||
bool SendHeader(TCPFrameType type, TCPFrameType ack_for, uint64_t run, uint32_t sock, uint32_t flags) {
|
||||
TcpFrameHeader h{};
|
||||
h.type = static_cast<uint16_t>(type);
|
||||
h.ack_for = static_cast<uint16_t>(ack_for);
|
||||
h.run_number = run;
|
||||
h.socket_number = sock;
|
||||
h.flags = flags;
|
||||
h.payload_size = 0;
|
||||
std::lock_guard<std::mutex> lg(send_mtx_);
|
||||
if (fd_ < 0)
|
||||
return false;
|
||||
return ::send(fd_, &h, sizeof(h), MSG_NOSIGNAL) == static_cast<ssize_t>(sizeof(h));
|
||||
}
|
||||
|
||||
bool ReadExact(void *buf, size_t len) {
|
||||
auto *p = static_cast<uint8_t *>(buf);
|
||||
size_t got = 0;
|
||||
while (got < len) {
|
||||
const ssize_t rc = ::recv(fd_, p + got, len - got, 0);
|
||||
if (rc <= 0)
|
||||
return false;
|
||||
got += static_cast<size_t>(rc);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BusyLoop() {
|
||||
// Heartbeat keeps the pusher's peer-liveness fresh even while we are not draining.
|
||||
while (!stop_) {
|
||||
SendHeader(TCPFrameType::BUSY, TCPFrameType::DATA, run_.load(), sock_.load(), 0);
|
||||
for (int i = 0; i < 5 && !stop_; i++)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
||||
}
|
||||
}
|
||||
|
||||
void ReaderLoop() {
|
||||
std::vector<uint8_t> discard;
|
||||
while (!stop_) {
|
||||
TcpFrameHeader h{};
|
||||
if (!ReadExact(&h, sizeof(h)))
|
||||
return;
|
||||
if (h.magic != JFJOCH_TCP_MAGIC || h.version != JFJOCH_TCP_VERSION)
|
||||
return;
|
||||
if (h.payload_size > 0) {
|
||||
discard.resize(h.payload_size);
|
||||
if (!ReadExact(discard.data(), discard.size()))
|
||||
return;
|
||||
}
|
||||
switch (static_cast<TCPFrameType>(h.type)) {
|
||||
case TCPFrameType::START:
|
||||
run_.store(h.run_number);
|
||||
sock_.store(h.socket_number);
|
||||
SendHeader(TCPFrameType::ACK, TCPFrameType::START, h.run_number, h.socket_number, TCP_ACK_FLAG_OK);
|
||||
{ // Stall: stop reading until released.
|
||||
std::unique_lock<std::mutex> ul(mtx_);
|
||||
cv_.wait(ul, [this] { return released_ || stop_; });
|
||||
}
|
||||
break;
|
||||
case TCPFrameType::DATA:
|
||||
data_frames_.fetch_add(1);
|
||||
break;
|
||||
case TCPFrameType::END:
|
||||
SendHeader(TCPFrameType::ACK, TCPFrameType::END, h.run_number, h.socket_number, TCP_ACK_FLAG_OK);
|
||||
end_acked_.store(true);
|
||||
return;
|
||||
default:
|
||||
break; // ignore KEEPALIVE etc.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int fd_ = -1;
|
||||
std::thread reader_thread_;
|
||||
std::thread busy_thread_;
|
||||
std::atomic<bool> stop_{false};
|
||||
std::atomic<uint64_t> run_{0};
|
||||
std::atomic<uint32_t> sock_{0};
|
||||
std::atomic<size_t> data_frames_{0};
|
||||
std::atomic<bool> end_acked_{false};
|
||||
std::mutex send_mtx_;
|
||||
std::mutex mtx_;
|
||||
std::condition_variable cv_;
|
||||
bool released_ = false;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_CASE("TCPImageCommTest_StalledWriter_SurvivesViaHeartbeat", "[TCP]") {
|
||||
// A writer that is alive (still heartbeating) but has stopped draining — e.g. wedged
|
||||
// on a slow filesystem at high frame rate — must NOT be dropped mid-run. The pusher
|
||||
// rides out the backpressure on the production zero-copy queue path until the writer
|
||||
// recovers. Regression for the queue-path send giving up on a fixed deadline, and for
|
||||
// the BUSY heartbeat keeping the connection alive past the peer-liveness window.
|
||||
constexpr int64_t N = 1000; // > queue depth + socket buffers
|
||||
constexpr auto liveness = std::chrono::milliseconds(2000);
|
||||
constexpr auto stall = std::chrono::milliseconds(4000); // > liveness AND > old send deadline
|
||||
|
||||
// Small SO_SNDBUF/SO_RCVBUF so backpressure reaches the queue after few images.
|
||||
TCPStreamPusher pusher("tcp://127.0.0.1:*", 1, 16 * 1024);
|
||||
pusher.SetPeerLivenessTimeout(liveness);
|
||||
|
||||
StallableWriterDouble writer(pusher.GetAddress()[0], 16 * 1024);
|
||||
REQUIRE(writer.Connected());
|
||||
|
||||
for (int attempt = 0; attempt < 200 && pusher.GetConnectedWriters() < 1; ++attempt)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(25));
|
||||
REQUIRE(pusher.GetConnectedWriters() == 1);
|
||||
|
||||
ImageBuffer image_buffer(16 * 1024 * 1024);
|
||||
image_buffer.StartMeasurement(static_cast<size_t>(4096));
|
||||
|
||||
StartMessage start{.images_per_file = 1000, .write_master_file = true};
|
||||
pusher.StartDataCollection(start); // writer ACKs START, then stalls (stops reading)
|
||||
|
||||
auto sender = std::async(std::launch::async, [&] {
|
||||
for (int64_t i = 0; i < N; i++) {
|
||||
ZeroCopyReturnValue *slot = nullptr;
|
||||
while ((slot = image_buffer.GetImageSlot()) == nullptr)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
std::memset(slot->GetImage(), 0, 256);
|
||||
slot->SetImageNumber(i);
|
||||
slot->SetImageSize(256); // arbitrary payload; the writer double discards it
|
||||
slot->ReadyToSend();
|
||||
pusher.SendImage(*slot);
|
||||
}
|
||||
});
|
||||
|
||||
// During the stall the queue is full; SendImage must block, not drop the connection.
|
||||
std::this_thread::sleep_for(stall);
|
||||
CHECK(pusher.GetConnectedWriters() == 1);
|
||||
CHECK(sender.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready);
|
||||
|
||||
// Writer recovers and starts draining.
|
||||
writer.Release();
|
||||
|
||||
REQUIRE(sender.wait_for(std::chrono::seconds(30)) == std::future_status::ready);
|
||||
sender.get();
|
||||
|
||||
// Every image makes it across once the stall clears.
|
||||
for (int attempt = 0; attempt < 1200 && writer.DataFramesReceived() < static_cast<size_t>(N); ++attempt)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(25));
|
||||
CHECK(writer.DataFramesReceived() == static_cast<size_t>(N));
|
||||
|
||||
// Queue fully drained: END now hands over cleanly without racing data frames.
|
||||
EndMessage end{};
|
||||
CHECK(pusher.EndDataCollection(end) == true);
|
||||
|
||||
for (int attempt = 0; attempt < 200 && !writer.EndAcked(); ++attempt)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(25));
|
||||
CHECK(writer.EndAcked());
|
||||
CHECK(pusher.GetConnectedWriters() == 1);
|
||||
|
||||
image_buffer.Finalize(std::chrono::seconds(5));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user