The peer-liveness timeout only catches a *silent* writer. A misbehaving writer that keeps sending BUSY heartbeats while never draining (e.g. a permanently wedged filesystem) would otherwise block SendAll -- and, through it, the queued SendImage path and the end-of-run frame_transformation_futures.get() -- forever. Add a progress-based cap in SendAll: if no bytes leave the socket for max_backpressure_timeout (default 60s, tunable via SetMaxBackpressureTimeout) the connection is declared dead regardless of heartbeats. It is one global cap, enforced everywhere SendAll runs, so it bounds both mid-run stalls and finalization. Generous relative to the 15s liveness window, since a heartbeating peer is given more grace than a silent one -- but finite. Add TCPImageCommTest_WedgedWriter_DroppedByBackpressureCap: a writer that ACKs START then stalls forever while heartbeating (cap 1.5s, liveness 5s) must have its connection dropped, and neither the producers nor EndDataCollection may hang. Verified to hang (timeout) with the cap disabled. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
194 lines
8.6 KiB
C++
194 lines
8.6 KiB
C++
// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#pragma once
|
|
|
|
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <future>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "ImagePusher.h"
#include "ZMQWriterNotificationPuller.h"
#include "../common/ThreadSafeFIFO.h"
#include "../common/Logger.h"
#include "../common/JfjochTCP.h"
#include "../frame_serialize/CBORStream2Serializer.h"
|
|
|
|
/// TCP-based image stream pusher with persistent connection pool.
|
|
///
|
|
/// Threading model:
|
|
/// - AcceptorThread: accepts new TCP connections, holds connections_mutex briefly
|
|
/// - KeepaliveThread: sends periodic keepalive frames when idle (skipped during data collection)
|
|
/// - Per-connection WriterThread: drains the connection's queue, sends DATA frames
|
|
/// - Per-connection PersistentAckThread: reads ACKs and keepalive pongs from the peer
|
|
///
|
|
/// Lock ordering: connections_mutex → send_mutex → ack_mutex
|
|
/// IMPORTANT: Never call blocking queue operations while holding connections_mutex.
|
|
///
|
|
/// Concurrency contract:
|
|
/// - StartDataCollection, EndDataCollection, SendCalibration, and Finalize
|
|
/// are called from a single control thread in a serialized manner.
|
|
/// - SendImage may be called concurrently from multiple threads between
|
|
/// StartDataCollection and EndDataCollection.
|
|
/// - SendCalibration is called between StartDataCollection and SendImage calls.
|
|
/// - GetConnectedWriters, GetImagesWritten, and PrintSetup are safe to call at any time.
|
|
|
|
class TCPStreamPusher : public ImagePusher {
|
|
struct Connection {
|
|
explicit Connection(size_t queue_size) : queue(queue_size) {}
|
|
|
|
std::atomic<int> fd{-1};
|
|
uint32_t socket_number = 0;
|
|
std::atomic<bool> active{false}; // data-collection threads running
|
|
std::atomic<bool> broken{false};
|
|
std::atomic<bool> connected{false}; // persistent connection is alive
|
|
|
|
ThreadSafeFIFO<ImagePusherQueueElement> queue;
|
|
std::future<void> writer_future;
|
|
|
|
// Persistent ack/keepalive reader (runs as long as the connection is alive)
|
|
std::future<void> persistent_ack_future;
|
|
|
|
std::mutex send_mutex;
|
|
std::mutex ack_mutex;
|
|
std::condition_variable ack_cv;
|
|
|
|
bool start_ack_received = false;
|
|
bool start_ack_ok = false;
|
|
bool end_ack_received = false;
|
|
bool end_ack_ok = false;
|
|
bool cancel_ack_received = false;
|
|
bool cancel_ack_ok = false;
|
|
|
|
std::string last_ack_error;
|
|
std::atomic<TCPAckCode> last_ack_code{TCPAckCode::None};
|
|
|
|
// Soft writer failure reported via DATA ACK (do not break stream on this alone)
|
|
std::atomic<bool> data_ack_error_reported{false};
|
|
std::string data_ack_error_text;
|
|
|
|
std::atomic<uint64_t> data_acked_ok{0};
|
|
std::atomic<uint64_t> data_acked_bad{0};
|
|
std::atomic<uint64_t> data_acked_total{0};
|
|
std::atomic<uint64_t> last_ack_fifo_occupancy{0};
|
|
|
|
std::chrono::steady_clock::time_point last_keepalive_sent{};
|
|
std::chrono::steady_clock::time_point last_keepalive_recv{};
|
|
|
|
// Last time ANY frame (ACK / keepalive pong / busy heartbeat) was received from
|
|
// the peer, as steady_clock nanoseconds. Used to keep a healthy-but-busy writer
|
|
// alive through long backpressure while still detecting a genuinely dead peer.
|
|
std::atomic<int64_t> last_peer_activity_ns{0};
|
|
};
|
|
|
|
std::string endpoint;
|
|
size_t max_connections;
|
|
std::optional<int32_t> send_buffer_size;
|
|
size_t send_queue_size = 128;
|
|
|
|
// Persistent connection pool, guarded by connections_mutex.
|
|
// IMPORTANT: never call PutBlocking/GetBlocking on a queue while holding this mutex.
|
|
mutable std::mutex connections_mutex;
|
|
std::vector<std::shared_ptr<Connection>> connections;
|
|
std::vector<std::shared_ptr<Connection>> session_connections;
|
|
std::shared_ptr<Connection> calibration_connection;
|
|
|
|
// Acceptor thread state
|
|
std::atomic<int> listen_fd{-1};
|
|
std::atomic<bool> acceptor_running{false};
|
|
std::future<void> acceptor_future;
|
|
std::future<void> keepalive_future;
|
|
|
|
std::chrono::milliseconds send_poll_timeout{250};
|
|
// Maximum time a send (or the post-END ACK wait) may block with NO sign of life from
|
|
// the peer before the connection is declared dead. A busy writer refreshes its liveness
|
|
// every ~250 ms via BUSY heartbeats (and via DATA ACKs), so genuine backpressure of any
|
|
// duration is tolerated; only a truly silent (frozen/dead) peer trips this.
|
|
std::chrono::milliseconds peer_liveness_timeout{15000};
|
|
// Hard upper bound on backpressure: if the socket accepts no bytes for this long the
|
|
// writer is wedged and is declared dead even if it keeps heartbeating, so a
|
|
// misbehaving writer cannot block the run (or its finalization) forever. Generous
|
|
// relative to peer_liveness_timeout, since a heartbeating peer is given more grace
|
|
// than a silent one — but still finite.
|
|
std::chrono::milliseconds max_backpressure_timeout{60000};
|
|
|
|
int64_t images_per_file = 1;
|
|
uint64_t run_number = 0;
|
|
std::string run_name;
|
|
std::atomic<bool> transmission_error = false;
|
|
std::atomic<bool> data_collection_active{false};
|
|
|
|
std::atomic<uint64_t> total_data_acked_ok{0};
|
|
std::atomic<uint64_t> total_data_acked_bad{0};
|
|
std::atomic<uint64_t> total_data_acked_total{0};
|
|
|
|
Logger logger{"TCPStreamPusher"};
|
|
|
|
static std::pair<std::string, std::optional<uint16_t>> ParseTcpAddress(const std::string& addr);
|
|
static std::pair<int, std::string> OpenListenSocket(const std::string& addr);
|
|
static int AcceptOne(int listen_fd, std::chrono::milliseconds timeout);
|
|
|
|
static void CloseFd(std::atomic<int>& fd);
|
|
bool IsConnectionAlive(const Connection& c) const;
|
|
bool SendAll(Connection& c, const void* buf, size_t len);
|
|
bool ReadExact(Connection& c, void* buf, size_t len);
|
|
bool ReadExactPersistent(Connection& c, void* buf, size_t len);
|
|
bool SendFrame(Connection& c, const uint8_t* data, size_t size, TCPFrameType type, int64_t image_number);
|
|
|
|
void WriterThread(Connection* c);
|
|
void PersistentAckThread(Connection* c);
|
|
void AcceptorThread();
|
|
void KeepaliveThread();
|
|
|
|
void SetupNewConnection(int new_fd, uint32_t socket_number);
|
|
void RemoveDeadConnections();
|
|
void TearDownConnection(Connection& c);
|
|
|
|
void StartDataCollectionThreads(Connection& c);
|
|
void StopDataCollectionThreads(Connection& c);
|
|
|
|
bool WaitForAck(Connection& c, TCPFrameType ack_for, std::chrono::milliseconds timeout, std::string* error_text);
|
|
bool WaitForEndAck(Connection& c, std::chrono::milliseconds liveness_timeout, std::string* error_text);
|
|
public:
|
|
explicit TCPStreamPusher(const std::string& addr,
|
|
size_t in_max_connections,
|
|
std::optional<int32_t> in_send_buffer_size = {});
|
|
|
|
~TCPStreamPusher() override;
|
|
|
|
/// Max time a send may block on backpressure with no sign of life from the peer
|
|
/// before the connection is declared dead. A busy-but-alive writer keeps it fresh
|
|
/// via BUSY heartbeats, so this only catches a genuinely silent peer. Must be set
|
|
/// before data collection starts.
|
|
void SetPeerLivenessTimeout(std::chrono::milliseconds t) { peer_liveness_timeout = t; }
|
|
|
|
/// Hard upper bound on backpressure. Even while the peer keeps heartbeating, if no
|
|
/// bytes can be sent for this long the writer is declared dead so a wedged writer
|
|
/// cannot block the run or its finalization forever. Must be set before data
|
|
/// collection starts.
|
|
void SetMaxBackpressureTimeout(std::chrono::milliseconds t) { max_backpressure_timeout = t; }
|
|
|
|
std::vector<std::string> GetAddress() const override { return {endpoint}; }
|
|
|
|
/// Returns the number of currently connected writers (can be called at any time)
|
|
size_t GetConnectedWriters() const override;
|
|
|
|
void StartDataCollection(StartMessage& message) override;
|
|
bool EndDataCollection(const EndMessage& message) override;
|
|
bool SendImage(const uint8_t *image_data, size_t image_size, int64_t image_number) override;
|
|
bool SendImage(ZeroCopyReturnValue &z) override;
|
|
bool SendCalibration(const CompressedImage& message) override;
|
|
|
|
std::string Finalize() override;
|
|
std::string PrintSetup() const override;
|
|
|
|
std::optional<uint64_t> GetImagesWritten() const override;
|
|
std::optional<uint64_t> GetImagesWriteError() const override;
|
|
std::vector<int64_t> GetWriterFifoUtilization() const override;
|
|
ImagePusherType GetType() const override { return ImagePusherType::TCP; }
|
|
};
|