Try settings on sf-daqsync-02

This commit is contained in:
Dominik Werder
2023-09-14 16:55:53 +02:00
parent f869047cf5
commit b21dfae560
16 changed files with 296 additions and 159 deletions

View File

@@ -509,6 +509,7 @@ impl CaConn {
remote_addr_dbg: SocketAddrV4,
local_epics_hostname: String,
channel_info_query_tx: Sender<ChannelInfoQuery>,
stats: Arc<CaConnStats>,
) -> Self {
let (cq_tx, cq_rx) = async_channel::bounded(32);
Self {
@@ -527,7 +528,7 @@ impl CaConn {
insert_item_queue: VecDeque::new(),
remote_addr_dbg,
local_epics_hostname,
stats: Arc::new(CaConnStats::new()),
stats,
insert_ivl_min_mus: 1000 * 6,
conn_command_tx: cq_tx,
conn_command_rx: cq_rx,
@@ -893,24 +894,27 @@ impl CaConn {
fn check_channels_alive(&mut self) -> Result<(), Error> {
let tsnow = Instant::now();
trace2!("check_channels_alive {addr:?}", addr = &self.remote_addr_dbg);
if self.ioc_ping_last.elapsed() > Duration::from_millis(20000) {
if let Some(started) = self.ioc_ping_start {
if started.elapsed() > Duration::from_millis(4000) {
warn!("pong timeout {addr:?}", addr = self.remote_addr_dbg);
let item = CaConnEvent {
ts: Instant::now(),
value: CaConnEventValue::EchoTimeout,
};
self.ca_conn_event_out_queue.push_back(item);
self.trigger_shutdown(ChannelStatusClosedReason::IocTimeout);
}
} else {
self.ioc_ping_start = Some(Instant::now());
if let Some(started) = self.ioc_ping_start {
if started.elapsed() >= Duration::from_millis(4000) {
self.stats.pong_timeout().inc();
self.ioc_ping_start = None;
warn!("pong timeout {addr:?}", addr = self.remote_addr_dbg);
let item = CaConnEvent {
ts: tsnow,
value: CaConnEventValue::EchoTimeout,
};
self.ca_conn_event_out_queue.push_back(item);
self.trigger_shutdown(ChannelStatusClosedReason::IocTimeout);
}
} else {
if self.ioc_ping_last.elapsed() > Duration::from_millis(20000) {
if let Some(proto) = &mut self.proto {
trace2!("ping to {}", self.remote_addr_dbg);
self.stats.ping_start().inc();
self.ioc_ping_start = Some(Instant::now());
let msg = CaMsg { ty: CaMsgTy::Echo };
proto.push_out(msg);
} else {
self.stats.ping_no_proto().inc();
warn!("can not ping {} no proto", self.remote_addr_dbg);
self.trigger_shutdown(ChannelStatusClosedReason::NoProtocol);
}
@@ -1057,6 +1061,7 @@ impl CaConn {
scalar_type: ScalarType,
shape: Shape,
ts: u64,
ts_local: u64,
ev: proto::EventAddRes,
item_queue: &mut VecDeque<QueryItem>,
ts_msp_last: u64,
@@ -1090,6 +1095,7 @@ impl CaConn {
shape,
val: ev.value.data.into(),
ts_msp_grid,
ts_local,
};
item_queue.push_back(QueryItem::Insert(item));
stats.insert_item_create.inc();
@@ -1102,6 +1108,7 @@ impl CaConn {
scalar_type: ScalarType,
shape: Shape,
ts: u64,
ts_local: u64,
ev: proto::EventAddRes,
tsnow: Instant,
item_queue: &mut VecDeque<QueryItem>,
@@ -1136,6 +1143,7 @@ impl CaConn {
scalar_type.clone(),
shape.clone(),
ts - 1 - i as u64,
ts_local - 1 - i as u64,
ev.clone(),
item_queue,
ts_msp_last,
@@ -1150,6 +1158,7 @@ impl CaConn {
scalar_type,
shape,
ts,
ts_local,
ev,
item_queue,
ts_msp_last,
@@ -1265,6 +1274,7 @@ impl CaConn {
scalar_type,
shape,
ts,
ts_local,
ev,
tsnow,
item_queue,
@@ -1504,16 +1514,28 @@ impl CaConn {
}
CaMsgTy::AccessRightsRes(_) => {}
CaMsgTy::Echo => {
let addr = &self.remote_addr_dbg;
// let addr = &self.remote_addr_dbg;
if let Some(started) = self.ioc_ping_start {
let dt = started.elapsed().as_secs_f32() * 1e3;
if dt > 50. {
info!("Received Echo {dt:10.0}ms {addr:?}");
} else if dt > 500. {
warn!("Received Echo {dt:10.0}ms {addr:?}");
if dt <= 10. {
self.stats.pong_recv_010ms().inc();
} else if dt <= 25. {
self.stats.pong_recv_025ms().inc();
} else if dt <= 50. {
self.stats.pong_recv_050ms().inc();
} else if dt <= 100. {
self.stats.pong_recv_100ms().inc();
} else if dt <= 200. {
self.stats.pong_recv_200ms().inc();
} else if dt <= 400. {
self.stats.pong_recv_400ms().inc();
} else {
self.stats.pong_recv_slow().inc();
// warn!("Received Echo {dt:10.0}ms {addr:?}");
}
} else {
info!("Received Echo even though we didn't asked for it {addr:?}");
let addr = &self.remote_addr_dbg;
warn!("Received Echo even though we didn't asked for it {addr:?}");
}
self.ioc_ping_last = Instant::now();
self.ioc_ping_start = None;
@@ -1786,8 +1808,8 @@ impl Stream for CaConn {
let poll_ts1 = Instant::now();
let ret = loop {
let qlen = self.insert_item_queue.len();
if qlen >= 200 {
self.thr_msg_poll.trigger_fmt("CaConn::poll_next", &[&qlen]);
if qlen > self.opts.insert_queue_max / 3 {
self.stats.insert_item_queue_pressure().inc();
}
break if let CaConnState::EndOfStream = self.state {
Ready(None)

View File

@@ -196,6 +196,7 @@ pub struct CaConnSetCtrl {
tx: Sender<CaConnSetEvent>,
rx: Receiver<CaConnSetItem>,
stats: Arc<CaConnSetStats>,
ca_conn_stats: Arc<CaConnStats>,
jh: JoinHandle<Result<(), Error>>,
}
@@ -246,6 +247,10 @@ impl CaConnSetCtrl {
/// Returns the shared handle to the connection-set level statistics counters.
pub fn stats(&self) -> &Arc<CaConnSetStats> {
&self.stats
}
/// Returns the shared handle to the per-CA-connection statistics counters.
/// The same `Arc` is cloned into each `CaConn` created by the set, so these
/// counters aggregate over all connections.
pub fn ca_conn_stats(&self) -> &Arc<CaConnStats> {
&self.ca_conn_stats
}
}
#[derive(Debug)]
@@ -295,6 +300,7 @@ pub struct CaConnSet {
shutdown_done: bool,
chan_check_next: Option<Channel>,
stats: Arc<CaConnSetStats>,
ca_conn_stats: Arc<CaConnStats>,
ioc_finder_jh: JoinHandle<Result<(), Error>>,
await_ca_conn_jhs: VecDeque<(SocketAddr, JoinHandle<Result<(), Error>>)>,
thr_msg_poll_1: ThrottleTrace,
@@ -318,6 +324,10 @@ impl CaConnSet {
super::finder::start_finder(find_ioc_res_tx.clone(), backend.clone(), pgconf);
let (channel_info_res_tx, channel_info_res_rx) = async_channel::bounded(400);
let stats = Arc::new(CaConnSetStats::new());
let ca_conn_stats = Arc::new(CaConnStats::new());
stats.test_1().inc();
stats.test_1().inc();
stats.test_1().inc();
let connset = Self {
backend,
local_epics_hostname,
@@ -342,6 +352,7 @@ impl CaConnSet {
shutdown_done: false,
chan_check_next: None,
stats: stats.clone(),
ca_conn_stats: ca_conn_stats.clone(),
connset_out_tx,
connset_out_queue: VecDeque::new(),
// connset_out_sender: SenderPolling::new(connset_out_tx),
@@ -357,6 +368,7 @@ impl CaConnSet {
tx: connset_inp_tx,
rx: connset_out_rx,
stats,
ca_conn_stats,
jh,
}
}
@@ -608,7 +620,7 @@ impl CaConnSet {
return Ok(());
}
self.thr_msg_storage_len
.trigger_fmt("msg", &[&self.storage_insert_sender.len()]);
.trigger("msg", &[&self.storage_insert_sender.len()]);
debug!("TODO handle_check_health");
// Trigger already the next health check, but use the current data that we have.
@@ -726,6 +738,7 @@ impl CaConnSet {
addr_v4,
add.local_epics_hostname,
self.channel_info_query_tx.clone(),
self.ca_conn_stats.clone(),
);
let conn_tx = conn.conn_command_tx();
let conn_stats = conn.stats();
@@ -1136,16 +1149,14 @@ impl Stream for CaConnSet {
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
use Poll::*;
self.stats.poll_fn_begin().inc();
debug!("CaConnSet poll");
loop {
self.stats.poll_loop_begin().inc();
let n1 = self.channel_info_query_queue.len();
let p2 = self.channel_info_query_sender.len();
let p3 = self.channel_info_res_rx.len();
self.thr_msg_poll_1
.trigger_fmt("CaConnSet channel_info_query_queue", &[&n1, &p2, &p3]);
self.thr_msg_poll_1.trigger("CaConnSet::poll", &[]);
self.stats.storage_insert_tx_len.set(self.storage_insert_tx.len() as _);
self.stats
.channel_info_query_queue_len
.set(self.channel_info_query_queue.len() as _);
self.stats
.channel_info_query_sender_len
.set(self.channel_info_query_sender.len().unwrap_or(0) as _);
@@ -1158,6 +1169,7 @@ impl Stream for CaConnSet {
self.stats.ca_conn_res_tx_len.set(self.ca_conn_res_tx.len() as _);
let mut have_pending = false;
let mut have_progress = false;
self.try_push_ca_conn_cmds();
@@ -1190,6 +1202,7 @@ impl Stream for CaConnSet {
error!("CaConn {addr} join error: {e}");
}
}
have_progress = true;
}
Pending => {
have_pending = true;
@@ -1206,7 +1219,9 @@ impl Stream for CaConnSet {
}
if self.storage_insert_sender.is_sending() {
match self.storage_insert_sender.poll_unpin(cx) {
Ready(Ok(())) => {}
Ready(Ok(())) => {
have_progress = true;
}
Ready(Err(_)) => {
let e = Error::with_msg_no_trace("can not send into channel");
error!("{e}");
@@ -1225,7 +1240,9 @@ impl Stream for CaConnSet {
}
if self.find_ioc_query_sender.is_sending() {
match self.find_ioc_query_sender.poll_unpin(cx) {
Ready(Ok(())) => {}
Ready(Ok(())) => {
have_progress = true;
}
Ready(Err(_)) => {
let e = Error::with_msg_no_trace("can not send into channel");
error!("{e}");
@@ -1244,7 +1261,9 @@ impl Stream for CaConnSet {
}
if self.channel_info_query_sender.is_sending() {
match self.channel_info_query_sender.poll_unpin(cx) {
Ready(Ok(())) => {}
Ready(Ok(())) => {
have_progress = true;
}
Ready(Err(_)) => {
let e = Error::with_msg_no_trace("can not send into channel");
error!("{e}");
@@ -1256,84 +1275,79 @@ impl Stream for CaConnSet {
}
}
let item = match self.find_ioc_res_rx.poll_next_unpin(cx) {
match self.find_ioc_res_rx.poll_next_unpin(cx) {
Ready(Some(x)) => match self.handle_ioc_query_result(x) {
Ok(()) => Ready(None),
Err(e) => Ready(Some(CaConnSetItem::Error(e))),
Ok(()) => {
have_progress = true;
}
Err(e) => break Ready(Some(CaConnSetItem::Error(e))),
},
Ready(None) => Ready(None),
Ready(None) => {}
Pending => {
have_pending = true;
Pending
}
};
match item {
Ready(Some(x)) => break Ready(Some(x)),
_ => {}
}
let item = match self.ca_conn_res_rx.poll_next_unpin(cx) {
match self.ca_conn_res_rx.poll_next_unpin(cx) {
Ready(Some((addr, ev))) => match self.handle_ca_conn_event(addr, ev) {
Ok(()) => Ready(None),
Err(e) => Ready(Some(CaConnSetItem::Error(e))),
Ok(()) => {
have_progress = true;
}
Err(e) => break Ready(Some(CaConnSetItem::Error(e))),
},
Ready(None) => Ready(None),
Ready(None) => {}
Pending => {
have_pending = true;
Pending
}
};
match item {
Ready(Some(x)) => break Ready(Some(x)),
_ => {}
}
let item = match self.channel_info_res_rx.poll_next_unpin(cx) {
match self.channel_info_res_rx.poll_next_unpin(cx) {
Ready(Some(x)) => match self.handle_series_lookup_result(x) {
Ok(()) => Ready(None),
Err(e) => Ready(Some(CaConnSetItem::Error(e))),
Ok(()) => {
have_progress = true;
}
Err(e) => break Ready(Some(CaConnSetItem::Error(e))),
},
Ready(None) => Ready(None),
Ready(None) => {}
Pending => {
have_pending = true;
Pending
}
};
match item {
Ready(Some(x)) => break Ready(Some(x)),
_ => {}
}
let item = match self.connset_inp_rx.poll_next_unpin(cx) {
match self.connset_inp_rx.poll_next_unpin(cx) {
Ready(Some(x)) => match self.handle_event(x) {
Ok(()) => Ready(None),
Err(e) => Ready(Some(CaConnSetItem::Error(e))),
Ok(()) => {
have_progress = true;
}
Err(e) => break Ready(Some(CaConnSetItem::Error(e))),
},
Ready(None) => Ready(None),
Ready(None) => {}
Pending => {
have_pending = true;
Pending
}
};
match item {
Ready(Some(x)) => break Ready(Some(x)),
_ => {}
}
break if self.ready_for_end_of_stream() {
if have_pending {
self.stats.ready_for_end_of_stream_with_pending().inc();
self.stats.ready_for_end_of_stream().inc();
if have_progress {
self.stats.ready_for_end_of_stream_with_progress().inc();
continue;
} else {
self.stats.ready_for_end_of_stream_no_pending().inc();
Ready(None)
}
Ready(None)
} else {
if have_pending {
self.stats.poll_pending().inc();
Pending
} else {
if have_progress {
self.stats.poll_reloop().inc();
continue;
} else {
if have_pending {
self.stats.poll_pending().inc();
Pending
} else {
self.stats.poll_no_progress_no_pending().inc();
let e = Error::with_msg_no_trace("no progress no pending");
Ready(Some(CaConnSetItem::Error(e)))
}
}
};
}

View File

@@ -89,7 +89,7 @@ async fn finder_worker_single(
backend: String,
db: Database,
) -> Result<(), Error> {
let pg = make_pg_client(&db)
let (pg, jh) = make_pg_client(&db)
.await
.map_err(|e| Error::with_msg_no_trace(e.to_string()))?;
let sql = concat!(

View File

@@ -70,7 +70,7 @@ impl DbUpdateWorker {
pub async fn ca_search(opts: CaIngestOpts, channels: &Vec<String>) -> Result<(), Error> {
info!("ca_search begin");
let pg = dbpg::conn::make_pg_client(opts.postgresql_config())
let (pg, jh) = dbpg::conn::make_pg_client(opts.postgresql_config())
.await
.map_err(|e| Error::with_msg_no_trace(e.to_string()))?;
dbpg::schema::schema_check(&pg)
@@ -123,7 +123,7 @@ pub async fn ca_search(opts: CaIngestOpts, channels: &Vec<String>) -> Result<(),
let mut dbworkers = Vec::new();
for _ in 0..DB_WORKER_COUNT {
let pg = dbpg::conn::make_pg_client(opts.postgresql_config())
let (pg, jh) = dbpg::conn::make_pg_client(opts.postgresql_config())
.await
.map_err(|e| Error::with_msg_no_trace(e.to_string()))?;
let w = DbUpdateWorker::new(dbrx.clone(), opts.backend().into(), pg).await?;

View File

@@ -26,7 +26,7 @@ pub struct CaIngestOpts {
timeout: Option<Duration>,
postgresql: Database,
scylla: ScyllaConfig,
array_truncate: Option<usize>,
array_truncate: Option<u64>,
insert_worker_count: Option<usize>,
insert_scylla_sessions: Option<usize>,
insert_queue_max: Option<usize>,
@@ -87,7 +87,7 @@ impl CaIngestOpts {
self.insert_queue_max.unwrap_or(64)
}
pub fn array_truncate(&self) -> usize {
pub fn array_truncate(&self) -> u64 {
self.array_truncate.unwrap_or(512)
}

View File

@@ -21,8 +21,10 @@ use stats::CaConnStatsAgg;
use stats::CaConnStatsAggDiff;
use stats::DaemonStats;
use stats::InsertWorkerStats;
use stats::SeriesByChannelStats;
use std::collections::HashMap;
use std::net::SocketAddrV4;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;
@@ -31,19 +33,28 @@ use taskrun::tokio;
/// Bundle of shared statistics handles served by the HTTP routes:
/// the `prometheus()` output of each stats struct is concatenated for the
/// metrics endpoint, and `insert_frac` is an atomic value that the
/// `/daqingest/insert_frac` route reads (GET) and stores (PUT).
pub struct StatsSet {
daemon: Arc<DaemonStats>,
ca_conn_set: Arc<CaConnSetStats>,
ca_conn: Arc<CaConnStats>,
insert_worker_stats: Arc<InsertWorkerStats>,
series_by_channel_stats: Arc<SeriesByChannelStats>,
insert_frac: Arc<AtomicU64>,
}
impl StatsSet {
/// Constructs a `StatsSet` from already-created, shared stats handles.
/// All parameters are `Arc`s owned jointly with the subsystems that
/// update them; this type only aggregates them for reporting.
pub fn new(
daemon: Arc<DaemonStats>,
ca_conn_set: Arc<CaConnSetStats>,
ca_conn: Arc<CaConnStats>,
insert_worker_stats: Arc<InsertWorkerStats>,
series_by_channel_stats: Arc<SeriesByChannelStats>,
insert_frac: Arc<AtomicU64>,
) -> Self {
Self {
daemon,
ca_conn_set,
ca_conn,
insert_worker_stats,
series_by_channel_stats,
insert_frac,
}
}
}
@@ -198,8 +209,12 @@ fn make_routes(dcom: Arc<DaemonComm>, connset_cmd_tx: Sender<CaConnSetEvent>, st
let mut s1 = stats_set.daemon.prometheus();
let s2 = stats_set.ca_conn_set.prometheus();
let s3 = stats_set.insert_worker_stats.prometheus();
let s4 = stats_set.ca_conn.prometheus();
let s5 = stats_set.series_by_channel_stats.prometheus();
s1.push_str(&s2);
s1.push_str(&s3);
s1.push_str(&s4);
s1.push_str(&s5);
s1
}
}),
@@ -241,7 +256,7 @@ fn make_routes(dcom: Arc<DaemonComm>, connset_cmd_tx: Sender<CaConnSetEvent>, st
}),
)
.route(
"/store_workers_rate",
"/daqingest/store_workers_rate",
get({
let dcom = dcom.clone();
|| async move { axum::Json(123) }
@@ -252,18 +267,20 @@ fn make_routes(dcom: Arc<DaemonComm>, connset_cmd_tx: Sender<CaConnSetEvent>, st
}),
)
.route(
"/insert_frac",
"/daqingest/insert_frac",
get({
let dcom = dcom.clone();
|| async move { axum::Json(123) }
let insert_frac = stats_set.insert_frac.clone();
|| async move { axum::Json(insert_frac.load(Ordering::Acquire)) }
})
.put({
let dcom = dcom.clone();
|v: extract::Json<u64>| async move {}
let insert_frac = stats_set.insert_frac.clone();
|v: extract::Json<u64>| async move {
insert_frac.store(v.0, Ordering::Release);
}
}),
)
.route(
"/extra_inserts_conf",
"/daqingest/extra_inserts_conf",
get({
let dcom = dcom.clone();
|| async move { axum::Json(serde_json::to_value(&"TODO").unwrap()) }
@@ -274,7 +291,7 @@ fn make_routes(dcom: Arc<DaemonComm>, connset_cmd_tx: Sender<CaConnSetEvent>, st
}),
)
.route(
"/insert_ivl_min",
"/daqingest/insert_ivl_min",
put({
let dcom = dcom.clone();
|v: extract::Json<u64>| async move {}

View File

@@ -18,16 +18,7 @@ impl ThrottleTrace {
}
}
pub fn trigger(&mut self, msg: &str) {
self.count += 1;
let tsnow = Instant::now();
if self.next <= tsnow {
self.next = tsnow + self.ivl;
debug!("{} (count {})", msg, self.count);
}
}
pub fn trigger_fmt(&mut self, msg: &str, params: &[&dyn fmt::Debug]) {
pub fn trigger(&mut self, msg: &str, params: &[&dyn fmt::Debug]) {
self.count += 1;
let tsnow = Instant::now();
if self.next <= tsnow {

View File

@@ -143,7 +143,7 @@ impl ConnTimeBin {
trace2!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
}
_ => {
warn!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
trace2!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
}
}
}
@@ -151,12 +151,12 @@ impl ConnTimeBin {
//type Cont<T> = EventsDim1<T>;
match scalar_type {
_ => {
warn!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
trace2!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
}
}
}
_ => {
warn!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
trace2!("TODO setup_event_acc {:?} {:?}", scalar_type, shape);
}
}
Ok(())