Simplify shutdown

This commit is contained in:
Dominik Werder
2023-09-08 16:07:08 +02:00
parent 4a31f3f81f
commit 8bbd6c37d1
10 changed files with 374 additions and 488 deletions

View File

@@ -6,8 +6,6 @@ pub mod proto;
pub mod search;
pub mod statemap;
use self::connset::CaConnSetCtrl;
use crate::ca::connset::CaConnSet;
use crate::metrics::ExtraInsertsConf;
use crate::rt::TokMx;
use futures_util::Future;
@@ -15,7 +13,6 @@ use futures_util::FutureExt;
use log::*;
use netpod::Database;
use scywr::insertworker::InsertWorkerOpts;
use scywr::iteminsertqueue::CommonInsertItemQueue;
use scywr::store::DataStore;
use stats::CaConnStatsAgg;
use std::pin::Pin;
@@ -39,7 +36,6 @@ pub struct IngestCommons {
pub pgconf: Arc<Database>,
pub backend: String,
pub local_epics_hostname: String,
pub insert_item_queue: Arc<CommonInsertItemQueue>,
pub data_store: Arc<DataStore>,
pub insert_ivl_min: Arc<AtomicU64>,
pub extra_inserts_conf: TokMx<ExtraInsertsConf>,

View File

@@ -262,6 +262,7 @@ enum CaConnState {
PeerReady,
Wait(Pin<Box<dyn Future<Output = ()> + Send>>),
Shutdown,
EndOfStream,
}
fn wait_fut(dt: u64) -> Pin<Box<dyn Future<Output = ()> + Send>> {
@@ -852,11 +853,11 @@ impl CaConn {
fn check_channels_alive(&mut self) -> Result<(), Error> {
let tsnow = Instant::now();
trace!("CheckChannelsAlive {addr:?}", addr = &self.remote_addr_dbg);
trace!("check_channels_alive {addr:?}", addr = &self.remote_addr_dbg);
if self.ioc_ping_last.elapsed() > Duration::from_millis(20000) {
if let Some(started) = self.ioc_ping_start {
if started.elapsed() > Duration::from_millis(4000) {
warn!("Echo timeout {addr:?}", addr = self.remote_addr_dbg);
warn!("pong timeout {addr:?}", addr = self.remote_addr_dbg);
let item = CaConnEvent {
ts: Instant::now(),
value: CaConnEventValue::EchoTimeout,
@@ -867,11 +868,11 @@ impl CaConn {
} else {
self.ioc_ping_start = Some(Instant::now());
if let Some(proto) = &mut self.proto {
debug!("push echo to {}", self.remote_addr_dbg);
debug!("ping to {}", self.remote_addr_dbg);
let msg = CaMsg { ty: CaMsgTy::Echo };
proto.push_out(msg);
} else {
warn!("can not push echo, no proto {}", self.remote_addr_dbg);
warn!("can not ping {} no proto", self.remote_addr_dbg);
self.trigger_shutdown(ChannelStatusClosedReason::NoProtocol);
}
}
@@ -1630,6 +1631,7 @@ impl CaConn {
Pending => Ok(Some(Pending)),
},
CaConnState::Shutdown => Ok(None),
CaConnState::EndOfStream => Ok(None),
}
}
@@ -1725,8 +1727,8 @@ impl CaConn {
Ok(())
}
fn outgoing_queues_empty(&self) -> bool {
self.channel_info_query_queue.is_empty() && !self.channel_info_query_sending.is_sending()
fn queues_async_out_flushed(&self) -> bool {
self.channel_info_query_queue.is_empty() && self.channel_info_query_sending.is_idle()
}
fn attempt_flush_channel_info_query(mut self: Pin<&mut Self>, cx: &mut Context) -> Result<(), Error> {
@@ -1741,7 +1743,7 @@ impl CaConn {
}
} else if let Some(item) = self.channel_info_query_queue.pop_front() {
let sd = &mut self.channel_info_query_sending;
sd.send2(item);
sd.send(item);
continue;
} else {
Ok(())
@@ -1758,7 +1760,9 @@ impl Stream for CaConn {
self.stats.caconn_poll_count_inc();
loop {
let mut have_pending = false;
break if let Err(e) = self.as_mut().handle_own_ticker(cx) {
break if let CaConnState::EndOfStream = self.state {
Ready(None)
} else if let Err(e) = self.as_mut().handle_own_ticker(cx) {
Ready(Some(Err(e)))
} else if let Some(item) = self.cmd_res_queue.pop_front() {
let item = CaConnEvent {
@@ -1779,21 +1783,17 @@ impl Stream for CaConn {
} else if let Ready(Some(Err(e))) = self.as_mut().handle_conn_command(cx) {
Ready(Some(Err(e)))
} else if let Some(item) = {
if self.is_shutdown() {
None
} else {
match self.loop_inner(cx) {
// TODO what does this mean: should we re-loop or yield something?
Ok(Some(Ready(()))) => None,
// This is the last step, so we yield Pending.
// But in general, this does not compose well when we would add another step.
Ok(Some(Pending)) => {
have_pending = true;
None
}
Ok(None) => None,
Err(e) => Some(Err(e)),
match self.loop_inner(cx) {
// TODO what does this mean: should we re-loop or yield something?
Ok(Some(Ready(()))) => None,
// This is the last step, so we yield Pending.
// But in general, this does not compose well when we would add another step.
Ok(Some(Pending)) => {
have_pending = true;
None
}
Ok(None) => None,
Err(e) => Some(Err(e)),
}
} {
Ready(Some(item))
@@ -1804,7 +1804,10 @@ impl Stream for CaConn {
ts: Instant::now(),
value: CaConnEventValue::None,
};
if have_pending {
if self.is_shutdown() && self.queues_async_out_flushed() {
self.state = CaConnState::EndOfStream;
Ready(None)
} else if have_pending {
Pending
} else {
continue;

View File

@@ -109,12 +109,18 @@ pub struct ChannelAdd {
local_epics_hostname: String,
}
/// Command payload asking the connection set to remove a channel, identified by name.
/// Sent via `ConnSetCmd::ChannelRemove` from `CaConnSetCtrl::remove_channel`.
#[derive(Debug, Clone)]
pub struct ChannelRemove {
    // Channel name; resolved to a `Channel` key via `Channel::new(name)` on the handler side.
    name: String,
}
#[derive(Debug)]
pub enum ConnSetCmd {
SeriesLookupResult(Result<ChannelInfoResult, dbpg::seriesbychannel::Error>),
ChannelAdd(ChannelAdd),
ChannelAddWithStatusId(ChannelAddWithStatusId),
ChannelAddWithAddr(ChannelAddWithAddr),
ChannelRemove(ChannelRemove),
IocAddrQueryResult(VecDeque<FindIocRes>),
CheckHealth,
Shutdown,
@@ -126,18 +132,22 @@ pub enum CaConnSetEvent {
CaConnEvent((SocketAddr, CaConnEvent)),
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub enum CaConnSetItem {
Healthy,
}
#[derive(Clone)]
pub struct CaConnSetCtrl {
tx: Sender<CaConnSetEvent>,
pub rx: Receiver<CaConnSetItem>,
rx: Receiver<CaConnSetItem>,
jh: JoinHandle<Result<(), Error>>,
}
impl CaConnSetCtrl {
/// Returns a clone of the receiver for items emitted by the connection-set task.
/// NOTE(review): assumes the channel type's clone semantics (each message is
/// delivered to exactly one receiver clone for async_channel) — confirm against
/// how callers use multiple receivers.
pub fn receiver(&self) -> Receiver<CaConnSetItem> {
    self.rx.clone()
}
pub async fn add_channel(&self, backend: String, name: String, local_epics_hostname: String) -> Result<(), Error> {
let cmd = ChannelAdd {
backend,
@@ -149,6 +159,13 @@ impl CaConnSetCtrl {
Ok(())
}
/// Queues a `ChannelRemove` command for the connection-set task.
///
/// Returns an error only if the command channel is closed (the `?` on `send`).
pub async fn remove_channel(&self, name: String) -> Result<(), Error> {
    let cmd = ChannelRemove { name };
    // Shadowing `cmd` as it is wrapped into the enum variant, then the event.
    let cmd = ConnSetCmd::ChannelRemove(cmd);
    self.tx.send(CaConnSetEvent::ConnSetCmd(cmd)).await?;
    Ok(())
}
pub async fn shutdown(&self) -> Result<(), Error> {
let cmd = ConnSetCmd::Shutdown;
self.tx.send(CaConnSetEvent::ConnSetCmd(cmd)).await?;
@@ -160,6 +177,11 @@ impl CaConnSetCtrl {
self.tx.send(CaConnSetEvent::ConnSetCmd(cmd)).await?;
Ok(())
}
/// Consumes the controller and awaits completion of the spawned connection-set task.
///
/// The double `??` first surfaces a join error (task panic/cancel, mapped into
/// `Error` via its message), then propagates the task's own `Result`.
pub async fn join(self) -> Result<(), Error> {
    self.jh.await.map_err(|e| Error::with_msg_no_trace(e.to_string()))??;
    Ok(())
}
}
#[derive(Debug)]
@@ -198,6 +220,7 @@ pub struct CaConnSet {
chan_check_next: Option<Channel>,
stats: CaConnSetStats,
connset_out_tx: Sender<CaConnSetItem>,
ioc_finder_jh: JoinHandle<Result<(), Error>>,
}
impl CaConnSet {
@@ -226,12 +249,14 @@ impl CaConnSet {
chan_check_next: None,
stats: CaConnSetStats::new(),
connset_out_tx,
ioc_finder_jh,
};
// TODO await on jh
let jh = tokio::spawn(CaConnSet::run(connset));
CaConnSetCtrl {
tx: connset_tx,
rx: connset_out_rx,
jh,
}
}
@@ -241,15 +266,31 @@ impl CaConnSet {
match x {
Ok(ev) => this.handle_event(ev).await?,
Err(_) => {
if this.shutdown_done {
if this.shutdown_stopping {
// all fine
break Ok(());
break;
} else {
error!("channel closed without shutdown_done");
error!("channel closed without shutdown_stopping");
}
}
}
if this.shutdown_stopping {
break;
}
}
debug!(
"search_tx sender {} receiver {}",
this.search_tx.sender_count(),
this.search_tx.receiver_count()
);
this.ioc_finder_jh
.await
.map_err(|e| Error::with_msg_no_trace(e.to_string()))??;
debug!("joined ioc_finder_jh");
this.connset_out_tx.close();
this.connset_rx.close();
this.shutdown_done = true;
Ok(())
}
async fn handle_event(&mut self, ev: CaConnSetEvent) -> Result<(), Error> {
@@ -258,6 +299,7 @@ impl CaConnSet {
ConnSetCmd::ChannelAdd(x) => self.handle_add_channel(x).await,
ConnSetCmd::ChannelAddWithStatusId(x) => self.handle_add_channel_with_status_id(x).await,
ConnSetCmd::ChannelAddWithAddr(x) => self.handle_add_channel_with_addr(x).await,
ConnSetCmd::ChannelRemove(x) => self.handle_remove_channel(x).await,
ConnSetCmd::IocAddrQueryResult(x) => self.handle_ioc_query_result(x).await,
ConnSetCmd::SeriesLookupResult(x) => self.handle_series_lookup_result(x).await,
ConnSetCmd::CheckHealth => self.handle_check_health().await,
@@ -301,6 +343,10 @@ impl CaConnSet {
}
async fn handle_add_channel(&mut self, add: ChannelAdd) -> Result<(), Error> {
if self.shutdown_stopping {
debug!("handle_add_channel but shutdown_stopping");
return Ok(());
}
// TODO should I add the transition through ActiveChannelState::Init as well?
let ch = Channel::new(add.name.clone());
let _st = self.channel_states.inner().entry(ch).or_insert_with(|| ChannelState {
@@ -322,6 +368,10 @@ impl CaConnSet {
}
async fn handle_add_channel_with_status_id(&mut self, add: ChannelAddWithStatusId) -> Result<(), Error> {
if self.shutdown_stopping {
debug!("handle_add_channel but shutdown_stopping");
return Ok(());
}
debug!("handle_add_channel_with_status_id {add:?}");
let ch = Channel::new(add.name.clone());
if let Some(chst) = self.channel_states.inner().get_mut(&ch) {
@@ -350,6 +400,10 @@ impl CaConnSet {
}
async fn handle_add_channel_with_addr(&mut self, add: ChannelAddWithAddr) -> Result<(), Error> {
if self.shutdown_stopping {
debug!("handle_add_channel but shutdown_stopping");
return Ok(());
}
if !self.ca_conn_ress.contains_key(&add.addr) {
let c = self.create_ca_conn(add.clone())?;
self.ca_conn_ress.insert(add.addr, c);
@@ -360,6 +414,43 @@ impl CaConnSet {
Ok(())
}
/// Marks a channel for removal by transitioning its state to `ChannelStateValue::ToRemove`.
///
/// The address is carried along only when the channel already has one
/// (`WithAddress`), so the removal can later be forwarded to the owning CaConn;
/// in every other state no connection holds the channel and `addr` is `None`.
/// Unknown channels and channels already marked `ToRemove` are ignored.
///
/// NOTE(review): the parameter is named `add` but carries a `ChannelRemove` —
/// consider renaming to `rm` or `cmd`.
async fn handle_remove_channel(&mut self, add: ChannelRemove) -> Result<(), Error> {
    let ch = Channel::new(add.name);
    if let Some(k) = self.channel_states.inner().get_mut(&ch) {
        match &k.value {
            ChannelStateValue::Active(j) => match j {
                // Not yet looked up: no address to detach from.
                ActiveChannelState::Init { .. } => {
                    k.value = ChannelStateValue::ToRemove { addr: None };
                }
                // Still waiting on the series-id lookup: no address yet.
                ActiveChannelState::WaitForStatusSeriesId { .. } => {
                    k.value = ChannelStateValue::ToRemove { addr: None };
                }
                ActiveChannelState::WithStatusSeriesId {
                    status_series_id: _,
                    state,
                } => match &state.inner {
                    WithStatusSeriesIdStateInner::UnknownAddress { .. } => {
                        k.value = ChannelStateValue::ToRemove { addr: None };
                    }
                    WithStatusSeriesIdStateInner::SearchPending { .. } => {
                        k.value = ChannelStateValue::ToRemove { addr: None };
                    }
                    // Only here is an IOC address known; keep it so the removal
                    // can reach the CaConn that owns this channel.
                    WithStatusSeriesIdStateInner::WithAddress { addr, state: _ } => {
                        k.value = ChannelStateValue::ToRemove {
                            addr: Some(addr.clone()),
                        };
                    }
                    WithStatusSeriesIdStateInner::NoAddress { .. } => {
                        k.value = ChannelStateValue::ToRemove { addr: None };
                    }
                },
            },
            // Already scheduled for removal: nothing to do.
            ChannelStateValue::ToRemove { .. } => {}
        }
    }
    Ok(())
}
async fn handle_ioc_query_result(&mut self, res: VecDeque<FindIocRes>) -> Result<(), Error> {
for e in res {
let ch = Channel::new(e.channel.clone());
@@ -416,6 +507,7 @@ impl CaConnSet {
debug!("TODO handle_shutdown");
debug!("shutdown received");
self.shutdown_stopping = true;
self.search_tx.close();
for (addr, res) in self.ca_conn_ress.iter() {
let item = ConnCommand::shutdown();
res.sender.send(item).await?;
@@ -428,16 +520,20 @@ impl CaConnSet {
if let Some(e) = self.ca_conn_ress.remove(&addr) {
match e.jh.await {
Ok(Ok(())) => {
self.stats.ca_conn_task_join_done_ok_inc();
debug!("CaConn {addr} finished well");
}
Ok(Err(e)) => {
self.stats.ca_conn_task_join_done_err_inc();
error!("CaConn {addr} task error: {e}");
}
Err(e) => {
self.stats.ca_conn_task_join_err_inc();
error!("CaConn {addr} join error: {e}");
}
}
} else {
self.stats.ca_conn_task_eos_non_exist_inc();
warn!("end-of-stream received for non-existent CaConn {addr}");
}
Ok(())

View File

@@ -338,14 +338,9 @@ fn start_finder_ca(tx: Sender<DaemonEvent>, tgts: Vec<SocketAddrV4>) -> (Sender<
taskrun::spawn({
async move {
while let Ok(item) = arx.recv().await {
match tx.send(DaemonEvent::SearchDone(item)).await {
Ok(_) => {}
Err(e) => {
error!("search res fwd {e}");
}
}
todo!("send the result item");
}
warn!("search res fwd nput broken");
warn!("search res fwd inp closed");
}
});
(qtx, ioc_finder_jh)

View File

@@ -1,11 +1,6 @@
use crate::ca::conn::CaConnEvent;
use crate::ca::connset::CaConnSetItem;
use crate::ca::findioc::FindIocRes;
use async_channel::Sender;
use err::Error;
use serde::Serialize;
use std::collections::VecDeque;
use std::net::SocketAddrV4;
#[derive(Clone, Debug, Serialize, PartialEq, PartialOrd, Eq, Ord)]
pub struct Channel {
@@ -22,13 +17,11 @@ impl Channel {
}
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub enum DaemonEvent {
TimerTick(u32, Sender<u32>),
ChannelAdd(Channel),
ChannelRemove(Channel),
SearchDone(Result<VecDeque<FindIocRes>, Error>),
CaConnEvent(SocketAddrV4, CaConnEvent),
CaConnSetItem(CaConnSetItem),
Shutdown,
}
@@ -40,17 +33,6 @@ impl DaemonEvent {
TimerTick(_, _) => format!("TimerTick"),
ChannelAdd(x) => format!("ChannelAdd {x:?}"),
ChannelRemove(x) => format!("ChannelRemove {x:?}"),
SearchDone(_x) => format!("SearchDone"),
CaConnEvent(_a, b) => {
use crate::ca::conn::CaConnEventValue::*;
match &b.value {
None => format!("CaConnEvent/None"),
EchoTimeout => format!("CaConnEvent/EchoTimeout"),
ConnCommandResult(_) => format!("CaConnEvent/ConnCommandResult"),
QueryItem(_) => format!("CaConnEvent/QueryItem"),
EndOfStream => format!("CaConnEvent/EndOfStream"),
}
}
CaConnSetItem(_) => format!("CaConnSetItem"),
Shutdown => format!("Shutdown"),
}

View File

@@ -270,11 +270,10 @@ pub async fn metrics_agg_task(
}
}
{
let val = ingest_commons
.insert_item_queue
.receiver()
.map_or(0, |x| x.len() as u64);
agg.store_worker_recv_queue_len.store(val, Ordering::Release);
warn!("TODO provide metrics with a weak ref to the query_item_channel");
let nitems = 0;
// let nitems = weak.upgrade()..len();
agg.store_worker_recv_queue_len.store(nitems, Ordering::Release);
}
let mut m = METRICS.lock().unwrap();
*m = Some(agg.clone());

View File

@@ -42,11 +42,15 @@ impl<T> SenderPolling<T> {
ret
}
/// True when no send future is in flight (nothing pending to be polled).
pub fn is_idle(&self) -> bool {
    self.fut.is_none()
}
/// True when a send future is currently in flight (inverse of `is_idle`).
pub fn is_sending(&self) -> bool {
    self.fut.is_some()
}
pub fn send(self: Pin<&mut Self>, item: T) {
pub fn send_pin(self: Pin<&mut Self>, item: T) {
let (tx, fut) = unsafe {
let x = Pin::get_unchecked_mut(self);
(x.sender_ptr.as_mut(), &mut x.fut)
@@ -55,7 +59,7 @@ impl<T> SenderPolling<T> {
*fut = Some(s);
}
pub fn send2(&mut self, item: T) {
pub fn send(&mut self, item: T) {
let sender = unsafe { self.sender_ptr.as_mut() };
let s = sender.send(item);
self.fut = Some(s);