WIP refactor

This commit is contained in:
Dominik Werder
2023-09-06 17:34:27 +02:00
parent 76915d5d82
commit ba9bb7e26c
13 changed files with 1028 additions and 891 deletions

View File

@@ -43,3 +43,4 @@ items_2 = { path = "../../daqbuffer/crates/items_2" }
streams = { path = "../../daqbuffer/crates/streams" }
taskrun = { path = "../../daqbuffer/crates/taskrun" }
bitshuffle = { path = "../../daqbuffer/crates/bitshuffle" }
batchtools = { path = "../batchtools" }

View File

@@ -1,8 +1,10 @@
pub mod conn;
pub mod connset;
pub mod finder;
pub mod findioc;
pub mod proto;
pub mod search;
pub mod statemap;
use self::connset::CaConnSetCtrl;
use crate::ca::connset::CaConnSet;

View File

@@ -9,6 +9,7 @@ use crate::ca::proto::EventAdd;
use crate::senderpolling::SenderPolling;
use crate::timebin::ConnTimeBin;
use async_channel::Sender;
use dbpg::seriesbychannel::CanSendChannelInfoResult;
use dbpg::seriesbychannel::ChannelInfoQuery;
use err::Error;
use futures_util::stream::FuturesUnordered;
@@ -306,6 +307,7 @@ fn info_store_msp_from_time(ts: SystemTime) -> u32 {
#[derive(Debug)]
pub enum ConnCommandKind {
SeriesLookupResult(Result<Existence<SeriesId>, dbpg::seriesbychannel::Error>),
ChannelAdd(String, ChannelStatusSeriesId),
ChannelRemove(String),
CheckHealth,
@@ -319,6 +321,13 @@ pub struct ConnCommand {
}
impl ConnCommand {
pub fn series_lookup(qu: Result<Existence<SeriesId>, dbpg::seriesbychannel::Error>) -> Self {
Self {
id: Self::make_id(),
kind: ConnCommandKind::SeriesLookupResult(qu),
}
}
pub fn channel_add(name: String, cssid: ChannelStatusSeriesId) -> Self {
Self {
id: Self::make_id(),
@@ -389,6 +398,21 @@ enum ChannelSetOp {
Remove,
}
/// Adapter that forwards a series-lookup result back into the `CaConn`
/// command queue as a `ConnCommand::SeriesLookupResult`.
struct SendSeriesLookup {
    tx: Sender<ConnCommand>,
}

impl CanSendChannelInfoResult for SendSeriesLookup {
    fn make_send(
        &self,
        item: Result<Existence<SeriesId>, dbpg::seriesbychannel::Error>,
    ) -> dbpg::seriesbychannel::BoxedSend {
        let sender = self.tx.clone();
        // Errors on the send are mapped to () per the BoxedSend contract.
        Box::pin(async move {
            sender
                .send(ConnCommand::series_lookup(item))
                .await
                .map_err(|_| ())
        })
    }
}
struct ChannelOpsResources<'a> {
channel_set_ops: &'a StdMutex<BTreeMap<String, ChannelSetOp>>,
channels: &'a mut BTreeMap<Cid, ChannelState>,
@@ -639,6 +663,7 @@ impl CaConn {
self.cmd_shutdown();
Ready(Some(Ok(())))
}
ConnCommandKind::SeriesLookupResult(_) => todo!("TODO handle SeriesLookupResult"),
}
}
Ready(None) => {
@@ -962,24 +987,28 @@ impl CaConn {
let _ = cx;
loop {
break if let Some(mut entry) = self.series_lookup_schedule.first_entry() {
let dummy = entry.get().dummy();
let query = std::mem::replace(entry.get_mut(), dummy);
match self.channel_info_query_tx.try_send(query) {
Ok(()) => {
entry.remove();
continue;
}
Err(e) => match e {
async_channel::TrySendError::Full(_) => {
warn!("series lookup channel full");
*entry.get_mut() = e.into_inner();
}
async_channel::TrySendError::Closed(_) => {
warn!("series lookup channel closed");
// *entry.get_mut() = e.into_inner();
todo!("emit_series_lookup");
#[cfg(DISABLED)]
{
let dummy = entry.get().dummy();
let query = std::mem::replace(entry.get_mut(), dummy);
match self.channel_info_query_tx.try_send(query) {
Ok(()) => {
entry.remove();
continue;
}
},
Err(e) => match e {
async_channel::TrySendError::Full(_) => {
warn!("series lookup channel full");
*entry.get_mut() = e.into_inner();
}
async_channel::TrySendError::Closed(_) => {
warn!("series lookup channel closed");
// *entry.get_mut() = e.into_inner();
entry.remove();
}
},
}
}
} else {
()
@@ -1439,29 +1468,32 @@ impl CaConn {
*ch_s = ChannelState::FetchingSeriesId(created_state);
// TODO handle error in different way. Should most likely not abort.
if !self.series_lookup_schedule.contains_key(&cid) {
let (tx, rx) = async_channel::bounded(1);
let tx = SendSeriesLookup {
tx: self.conn_command_tx.clone(),
};
let query = ChannelInfoQuery {
backend: self.backend.clone(),
channel: name.clone(),
scalar_type: scalar_type.to_scylla_i32(),
shape_dims: shape.to_scylla_vec(),
tx,
tx: Box::pin(tx),
};
self.series_lookup_schedule.insert(cid, query);
let fut = async move {
match rx.recv().await {
Ok(item) => match item {
Ok(item) => Ok((cid, sid, k.data_type, k.data_count, item)),
Err(e) => Err(Error::with_msg_no_trace(e.to_string())),
},
Err(e) => {
// TODO count only
error!("can not receive series lookup result for {name} {e}");
Err(Error::with_msg_no_trace("can not receive lookup result"))
}
}
};
self.series_lookup_futs.push(Box::pin(fut));
todo!("TODO discover the series lookup from main command queue");
// let fut = async move {
// match rx.recv().await {
// Ok(item) => match item {
// Ok(item) => Ok((cid, sid, k.data_type, k.data_count, item)),
// Err(e) => Err(Error::with_msg_no_trace(e.to_string())),
// },
// Err(e) => {
// // TODO count only
// error!("can not receive series lookup result for {name} {e}");
// Err(Error::with_msg_no_trace("can not receive lookup result"))
// }
// }
// };
// self.series_lookup_futs.push(Box::pin(fut));
} else {
// TODO count only
warn!("series lookup for {name} already in progress");

View File

@@ -1,34 +1,77 @@
use super::conn::CaConnEvent;
use super::conn::ConnCommand;
use super::findioc::FindIocRes;
use super::statemap;
use super::statemap::ChannelState;
use crate::ca::conn::CaConn;
use crate::ca::conn::CaConnEvent;
use crate::ca::conn::CaConnEventValue;
use crate::ca::conn::CaConnOpts;
use crate::ca::conn::ConnCommand;
use crate::ca::statemap::CaConnState;
use crate::ca::statemap::ConnectionState;
use crate::ca::statemap::ConnectionStateValue;
use crate::ca::statemap::WithAddressState;
use crate::daemon_common::Channel;
use crate::errconv::ErrConv;
use crate::rt::JoinHandle;
use crate::rt::TokMx;
use async_channel::Receiver;
use async_channel::Sender;
use atomic::AtomicUsize;
use dbpg::seriesbychannel::BoxedSend;
use dbpg::seriesbychannel::CanSendChannelInfoResult;
use dbpg::seriesbychannel::ChannelInfoQuery;
use err::Error;
use futures_util::FutureExt;
use futures_util::StreamExt;
use netpod::log::*;
use log::*;
use netpod::Database;
use netpod::Shape;
use scywr::iteminsertqueue::ChannelStatusItem;
use scywr::iteminsertqueue::QueryItem;
use series::series::Existence;
use series::ChannelStatusSeriesId;
use series::SeriesId;
use statemap::ActiveChannelState;
use statemap::CaConnStateValue;
use statemap::ChannelStateMap;
use statemap::ChannelStateValue;
use statemap::WithStatusSeriesIdState;
use statemap::WithStatusSeriesIdStateInner;
use statemap::CHANNEL_STATUS_DUMMY_SCALAR_TYPE;
use stats::CaConnSetStats;
use stats::CaConnStats;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::net::SocketAddr;
use std::net::SocketAddrV4;
use std::sync::atomic;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use taskrun::tokio;
// Whether to hand channels with a known address over to a CaConn
// (currently only referenced from a cfg(DISABLED) path — TODO confirm still needed).
const DO_ASSIGN_TO_CA_CONN: bool = true;
// Max channels examined per check_channel_states pass before yielding.
const CHECK_CHANS_PER_TICK: usize = 10000;
// Batch size used by the finder's query batcher.
pub const SEARCH_BATCH_MAX: usize = 256;
// Upper bound on concurrently pending IOC searches; also sizes the finder input channel.
pub const CURRENT_SEARCH_PENDING_MAX: usize = SEARCH_BATCH_MAX * 4;
// Dwell time in UnknownAddress before a search is scheduled.
const UNKNOWN_ADDRESS_STAY: Duration = Duration::from_millis(2000);
// Dwell time in NoAddress before retrying address discovery.
const NO_ADDRESS_STAY: Duration = Duration::from_millis(20000);
// After this, a pending search is considered failed and the channel goes back to NoAddress.
const SEARCH_PENDING_TIMEOUT: Duration = Duration::from_millis(30000);
// Warn threshold for slow searches (usage not visible here — TODO confirm).
const SEARCH_PENDING_TIMEOUT_WARN: Duration = Duration::from_millis(8000);
// TODO put all these into metrics
static SEARCH_REQ_MARK_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_REQ_SEND_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_REQ_RECV_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_REQ_BATCH_SEND_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_ANS_COUNT: AtomicUsize = AtomicUsize::new(0);
/// Identifier for a command sent to a specific `CaConn`: the peer address
/// plus a per-connection sequence number.
#[derive(Debug, PartialEq, Eq)]
pub struct CmdId(SocketAddrV4, usize);
pub struct CaConnRes {
state: CaConnState,
sender: Sender<ConnCommand>,
stats: Arc<CaConnStats>,
// TODO await on jh
@@ -41,18 +84,37 @@ impl CaConnRes {
}
}
/// Add-channel payload for the final stage: status series id and the
/// IOC address are both known, so the channel can be assigned to a `CaConn`.
#[derive(Debug, Clone)]
pub struct ChannelAddWithAddr {
    backend: String,
    name: String,
    local_epics_hostname: String,
    // Status series id previously obtained via the series lookup.
    cssid: ChannelStatusSeriesId,
    // Address of the IOC hosting this channel.
    addr: SocketAddr,
}
/// Add-channel payload for the intermediate stage: the status series id
/// has been resolved but the IOC address is not yet known.
#[derive(Debug, Clone)]
pub struct ChannelAddWithStatusId {
    backend: String,
    name: String,
    local_epics_hostname: String,
    // Status series id previously obtained via the series lookup.
    cssid: ChannelStatusSeriesId,
}
/// Initial add-channel payload as submitted by the controller.
/// NOTE(review): this rendering of the diff may interleave removed lines;
/// `addr`/`cssid` look superseded by `ChannelAddWithAddr`/`ChannelAddWithStatusId`
/// (the new `add_channel` no longer supplies them) — confirm which fields remain.
#[derive(Debug, Clone)]
pub struct ChannelAdd {
    backend: String,
    name: String,
    addr: SocketAddr,
    cssid: ChannelStatusSeriesId,
    local_epics_hostname: String,
}
/// Commands consumed by the `CaConnSet` event loop (`handle_event`).
#[derive(Debug)]
pub enum ConnSetCmd {
    /// Result of a series-id lookup fed back from the dbpg worker.
    SeriesLookupResult(Result<Existence<SeriesId>, dbpg::seriesbychannel::Error>),
    /// Stage 1: channel submitted, no status series id yet.
    ChannelAdd(ChannelAdd),
    /// Stage 2: status series id resolved, address still unknown.
    ChannelAddWithStatusId(ChannelAddWithStatusId),
    /// Stage 3: address known, ready to assign to a CaConn.
    ChannelAddWithAddr(ChannelAddWithAddr),
    /// Batch of IOC address search results from the finder.
    IocAddrQueryResult(VecDeque<FindIocRes>),
    CheckHealth,
    Shutdown,
}
@@ -69,19 +131,10 @@ pub struct CaConnSetCtrl {
}
impl CaConnSetCtrl {
pub async fn add_channel(
&self,
backend: String,
addr: SocketAddr,
name: String,
cssid: ChannelStatusSeriesId,
local_epics_hostname: String,
) -> Result<(), Error> {
pub async fn add_channel(&self, backend: String, name: String, local_epics_hostname: String) -> Result<(), Error> {
let cmd = ChannelAdd {
backend,
name,
addr,
cssid,
local_epics_hostname,
};
let cmd = ConnSetCmd::ChannelAdd(cmd);
@@ -102,28 +155,65 @@ impl CaConnSetCtrl {
}
}
/// Request to the finder: look up the IOC address for a channel by name.
#[derive(Debug)]
pub struct IocAddrQuery {
    pub name: String,
}
/// Adapter that routes a series-lookup result back into the `CaConnSet`
/// event loop as a `ConnSetCmd::SeriesLookupResult`.
struct SeriesLookupSender {
    tx: Sender<CaConnSetEvent>,
}

impl CanSendChannelInfoResult for SeriesLookupSender {
    fn make_send(&self, item: Result<Existence<SeriesId>, dbpg::seriesbychannel::Error>) -> BoxedSend {
        let sender = self.tx.clone();
        // Errors on the send are mapped to () per the BoxedSend contract.
        Box::pin(async move {
            let ev = CaConnSetEvent::ConnSetCmd(ConnSetCmd::SeriesLookupResult(item));
            sender.send(ev).await.map_err(|_| ())
        })
    }
}
/// Owns the set of per-IOC `CaConn`s and the channel state machine;
/// driven by events arriving on `connset_rx`.
pub struct CaConnSet {
    backend: String,
    local_epics_hostname: String,
    // Requests to the IOC address finder.
    search_tx: Sender<IocAddrQuery>,
    // One CaConn (plus its state/stats/join handle) per IOC address.
    ca_conn_ress: BTreeMap<SocketAddr, CaConnRes>,
    channel_states: ChannelStateMap,
    // Cloned into helpers so results come back through the same event loop.
    connset_tx: Sender<CaConnSetEvent>,
    connset_rx: Receiver<CaConnSetEvent>,
    // Series-id lookups (dbpg worker).
    channel_info_query_tx: Sender<ChannelInfoQuery>,
    // Items destined for storage (scylla insert queue).
    storage_insert_tx: Sender<QueryItem>,
    shutdown: bool,
    // Resume point for the incremental channel-state scan.
    chan_check_next: Option<Channel>,
    stats: CaConnSetStats,
}
impl CaConnSet {
pub fn start(
backend: String,
local_epics_hostname: String,
storage_insert_tx: Sender<QueryItem>,
channel_info_query_tx: Sender<ChannelInfoQuery>,
pgconf: Database,
) -> CaConnSetCtrl {
let (connset_tx, connset_rx) = async_channel::bounded(10000);
let (search_tx, ioc_finder_jh) = super::finder::start_finder(connset_tx.clone(), backend.clone(), pgconf);
let connset = Self {
backend,
local_epics_hostname,
search_tx,
ca_conn_ress: BTreeMap::new(),
channel_states: ChannelStateMap::new(),
connset_tx: connset_tx.clone(),
connset_rx,
channel_info_query_tx,
storage_insert_tx,
shutdown: false,
chan_check_next: None,
stats: CaConnSetStats::new(),
};
// TODO await on jh
let jh = tokio::spawn(CaConnSet::run(connset));
@@ -150,7 +240,40 @@ impl CaConnSet {
async fn handle_event(&mut self, ev: CaConnSetEvent) -> Result<(), Error> {
match ev {
CaConnSetEvent::ConnSetCmd(cmd) => match cmd {
ConnSetCmd::ChannelAdd(x) => self.add_channel_to_addr(x).await,
ConnSetCmd::ChannelAdd(x) => self.handle_add_channel(x).await,
ConnSetCmd::ChannelAddWithStatusId(x) => self.handle_add_channel_with_status_id(x).await,
ConnSetCmd::IocAddrQueryResult(res) => {
for e in res {
if let Some(addr) = e.addr {
let ch = Channel::new(e.channel.clone());
if let Some(chst) = self.channel_states.inner().get(&ch) {
if let ChannelStateValue::Active(ast) = &chst.value {
if let ActiveChannelState::WithStatusSeriesId {
status_series_id,
state,
} = ast
{
let add = ChannelAddWithAddr {
backend: self.backend.clone(),
name: e.channel,
addr: SocketAddr::V4(addr),
cssid: status_series_id.clone(),
local_epics_hostname: self.local_epics_hostname.clone(),
};
} else {
warn!("TODO got address but no longer active");
}
} else {
warn!("TODO got address but no longer active");
}
} else {
warn!("ioc addr lookup done but channel no longer here");
}
}
}
Ok(())
}
ConnSetCmd::ChannelAddWithAddr(x) => self.handle_add_channel_with_addr(x).await,
ConnSetCmd::CheckHealth => {
error!("TODO implement check health");
Ok(())
@@ -160,6 +283,7 @@ impl CaConnSet {
self.shutdown = true;
Ok(())
}
ConnSetCmd::SeriesLookupResult(_) => todo!(),
},
CaConnSetEvent::CaConnEvent((addr, ev)) => match ev.value {
CaConnEventValue::None => Ok(()),
@@ -174,7 +298,53 @@ impl CaConnSet {
}
}
async fn add_channel_to_addr(&mut self, add: ChannelAdd) -> Result<(), Error> {
    /// Stage 1 of adding a channel: record it as waiting for its status
    /// series id and enqueue the series lookup. The result returns to this
    /// event loop via `SeriesLookupSender` as `ConnSetCmd::SeriesLookupResult`.
    async fn handle_add_channel(&mut self, add: ChannelAdd) -> Result<(), Error> {
        // TODO should I add the transition through ActiveChannelState::Init as well?
        let ch = Channel::new(add.name.clone());
        // Idempotent: an already-known channel keeps its existing state.
        let _st = self.channel_states.inner().entry(ch).or_insert_with(|| ChannelState {
            value: ChannelStateValue::Active(ActiveChannelState::WaitForStatusSeriesId {
                since: SystemTime::now(),
            }),
        });
        let item = ChannelInfoQuery {
            backend: add.backend,
            channel: add.name,
            // Dummy scalar type / empty shape: this lookup is for the status series.
            scalar_type: CHANNEL_STATUS_DUMMY_SCALAR_TYPE,
            shape_dims: Vec::new(),
            tx: Box::pin(SeriesLookupSender {
                tx: self.connset_tx.clone(),
            }),
        };
        self.channel_info_query_tx.send(item).await?;
        Ok(())
    }
    /// Stage 2: the status series id arrived; advance the channel from
    /// WaitForStatusSeriesId to WithStatusSeriesId (NoAddress) so the
    /// address search can proceed. Channels found in any other state are
    /// only warned about.
    /// NOTE(review): all three fallback branches emit the same message even
    /// though they represent different mismatches — consider distinct logs.
    async fn handle_add_channel_with_status_id(&mut self, add: ChannelAddWithStatusId) -> Result<(), Error> {
        let ch = Channel::new(add.name.clone());
        if let Some(chst) = self.channel_states.inner().get_mut(&ch) {
            if let ChannelStateValue::Active(chst2) = &mut chst.value {
                if let ActiveChannelState::WaitForStatusSeriesId { .. } = chst2 {
                    *chst2 = ActiveChannelState::WithStatusSeriesId {
                        status_series_id: add.cssid,
                        state: WithStatusSeriesIdState {
                            inner: WithStatusSeriesIdStateInner::NoAddress {
                                since: SystemTime::now(),
                            },
                        },
                    };
                } else {
                    warn!("TODO have a status series id but no more channel");
                }
            } else {
                warn!("TODO have a status series id but no more channel");
            }
        } else {
            warn!("TODO have a status series id but no more channel");
        }
        Ok(())
    }
async fn handle_add_channel_with_addr(&mut self, add: ChannelAddWithAddr) -> Result<(), Error> {
if !self.ca_conn_ress.contains_key(&add.addr) {
let c = self.create_ca_conn(add.clone())?;
self.ca_conn_ress.insert(add.addr, c);
@@ -185,7 +355,7 @@ impl CaConnSet {
Ok(())
}
fn create_ca_conn(&self, add: ChannelAdd) -> Result<CaConnRes, Error> {
fn create_ca_conn(&self, add: ChannelAddWithAddr) -> Result<CaConnRes, Error> {
// TODO should we save this as event?
let opts = CaConnOpts::default();
let addr = add.addr;
@@ -207,6 +377,7 @@ impl CaConnSet {
let conn_item_tx = self.connset_tx.clone();
let jh = tokio::spawn(Self::ca_conn_item_merge(conn, conn_item_tx, addr_v4));
let ca_conn_res = CaConnRes {
state: CaConnState::new(CaConnStateValue::Fresh),
sender: conn_tx,
stats: conn_stats,
jh,
@@ -354,4 +525,221 @@ impl CaConnSet {
Ok(false)
}
}
    /// Periodic sweep over all `CaConn`s: connections without recent status
    /// feedback are moved to `Shutdown`; connections stuck in `Shutdown`
    /// too long are reported as an error.
    fn check_connection_states(&mut self) -> Result<(), Error> {
        let tsnow = Instant::now();
        for (addr, val) in &mut self.ca_conn_ress {
            let state = &mut val.state;
            let v = &mut state.value;
            match v {
                CaConnStateValue::Fresh => {
                    // TODO check for delta t since last issued status command.
                    // 20 s without feedback on a fresh connection => shut it down.
                    if tsnow.duration_since(state.last_feedback) > Duration::from_millis(20000) {
                        error!("TODO Fresh timeout send connection-close for {addr}");
                        // TODO collect in metrics
                        // self.stats.ca_conn_status_feedback_timeout_inc();
                        // TODO send shutdown to this CaConn, check that we've received
                        // a 'shutdown' state from it. (see below)
                        *v = CaConnStateValue::Shutdown { since: tsnow };
                    }
                }
                CaConnStateValue::HadFeedback => {
                    // TODO check for delta t since last issued status command.
                    // Same 20 s feedback timeout once the connection had feedback before.
                    if tsnow.duration_since(state.last_feedback) > Duration::from_millis(20000) {
                        error!("TODO HadFeedback timeout send connection-close for {addr}");
                        // TODO collect in metrics
                        // self.stats.ca_conn_status_feedback_timeout_inc();
                        *v = CaConnStateValue::Shutdown { since: tsnow };
                    }
                }
                CaConnStateValue::Shutdown { since } => {
                    // A CaConn that has not completed shutdown within 10 s indicates a bug.
                    if tsnow.saturating_duration_since(*since) > Duration::from_millis(10000) {
                        // TODO collect in metrics as severe error, this would be a bug.
                        // self.stats.critical_error_inc();
                        error!("Shutdown of CaConn failed for {addr}");
                    }
                }
            }
        }
        Ok(())
    }
    /// Incremental sweep of the channel state machine. Resumes from
    /// `chan_check_next`, visits at most CHECK_CHANS_PER_TICK entries, and
    /// advances channels through timeout-driven transitions:
    /// WaitForStatusSeriesId -(5 s)-> Init; UnknownAddress -> SearchPending
    /// (bounded by CURRENT_SEARCH_PENDING_MAX); SearchPending -(timeout)->
    /// NoAddress; NoAddress -(stay)-> UnknownAddress.
    async fn check_channel_states(&mut self) -> Result<(), Error> {
        let (mut search_pending_count,) = self.update_channel_state_counts();
        let k = self.chan_check_next.take();
        let it = if let Some(last) = k {
            trace!("check_chans start at {:?}", last);
            self.channel_states.inner().range_mut(last..)
        } else {
            self.channel_states.inner().range_mut(..)
        };
        let tsnow = SystemTime::now();
        // NOTE(review): currently unused in this body — leftover from the refactor?
        let mut attempt_series_search = true;
        for (i, (ch, st)) in it.enumerate() {
            match &mut st.value {
                ChannelStateValue::Active(st2) => match st2 {
                    ActiveChannelState::Init { since: _ } => {
                        todo!()
                    }
                    ActiveChannelState::WaitForStatusSeriesId { since } => {
                        let dt = tsnow.duration_since(*since).unwrap_or(Duration::ZERO);
                        // Status series id did not arrive in time: restart from Init.
                        if dt > Duration::from_millis(5000) {
                            warn!("timeout can not get status series id for {ch:?}");
                            *st2 = ActiveChannelState::Init { since: tsnow };
                        } else {
                            // TODO
                        }
                    }
                    ActiveChannelState::WithStatusSeriesId {
                        status_series_id,
                        state,
                    } => match &mut state.inner {
                        WithStatusSeriesIdStateInner::UnknownAddress { since } => {
                            let dt = tsnow.duration_since(*since).unwrap_or(Duration::ZERO);
                            if dt > UNKNOWN_ADDRESS_STAY {
                                //info!("UnknownAddress {} {:?}", i, ch);
                                // Only mark a search if we are under the global pending cap.
                                if (search_pending_count as usize) < CURRENT_SEARCH_PENDING_MAX {
                                    search_pending_count += 1;
                                    state.inner = WithStatusSeriesIdStateInner::SearchPending {
                                        since: tsnow,
                                        did_send: false,
                                    };
                                    SEARCH_REQ_MARK_COUNT.fetch_add(1, atomic::Ordering::AcqRel);
                                }
                            }
                        }
                        WithStatusSeriesIdStateInner::SearchPending { since, did_send: _ } => {
                            //info!("SearchPending {} {:?}", i, ch);
                            let dt = tsnow.duration_since(*since).unwrap_or(Duration::ZERO);
                            if dt > SEARCH_PENDING_TIMEOUT {
                                info!("Search timeout for {ch:?}");
                                state.inner = WithStatusSeriesIdStateInner::NoAddress { since: tsnow };
                                search_pending_count -= 1;
                            }
                        }
                        WithStatusSeriesIdStateInner::WithAddress { addr: addr_v4, state } => {
                            //info!("WithAddress {} {:?}", i, ch);
                            use WithAddressState::*;
                            match state {
                                Unassigned { assign_at } => {
                                    // TODO do I need this case anymore?
                                    // Disabled legacy path that assigned the channel to a CaConn
                                    // and recorded the status item; kept for reference.
                                    #[cfg(DISABLED)]
                                    if DO_ASSIGN_TO_CA_CONN && *assign_at <= tsnow {
                                        let backend = self.backend.clone();
                                        let addr = SocketAddr::V4(*addr_v4);
                                        let name = ch.id().into();
                                        let cssid = status_series_id.clone();
                                        let local_epics_hostname = self.local_epics_hostname.clone();
                                        // This operation is meant to complete very quickly
                                        let add = ChannelAdd {
                                            backend: backend,
                                            name: name,
                                            addr,
                                            cssid,
                                            local_epics_hostname,
                                        };
                                        self.handle_add_channel(add).await?;
                                        let cs = ConnectionState {
                                            updated: tsnow,
                                            value: ConnectionStateValue::Unconnected,
                                        };
                                        // TODO if a matching CaConn does not yet exist, it gets created
                                        // via the command through the channel, so we can not await it here.
                                        // Therefore, would be good to have a separate status entry out of
                                        // the ca_conn_ress right here in a sync fashion.
                                        *state = WithAddressState::Assigned(cs);
                                        let item = QueryItem::ChannelStatus(ChannelStatusItem {
                                            ts: tsnow,
                                            series: SeriesId::new(status_series_id.id()),
                                            status: scywr::iteminsertqueue::ChannelStatus::AssignedToAddress,
                                        });
                                        match self.storage_insert_tx.send(item).await {
                                            Ok(_) => {}
                                            Err(_) => {
                                                // TODO feed into throttled log, or count as unlogged
                                            }
                                        }
                                    }
                                }
                                Assigned(_) => {
                                    // TODO check if channel is healthy and alive
                                }
                            }
                        }
                        WithStatusSeriesIdStateInner::NoAddress { since } => {
                            let dt = tsnow.duration_since(*since).unwrap_or(Duration::ZERO);
                            if dt > NO_ADDRESS_STAY {
                                state.inner = WithStatusSeriesIdStateInner::UnknownAddress { since: tsnow };
                            }
                        }
                    },
                },
                ChannelStateValue::ToRemove { .. } => {
                    // TODO if assigned to some address,
                }
            }
            // Budget exhausted: remember where to resume on the next tick.
            if i >= CHECK_CHANS_PER_TICK {
                self.chan_check_next = Some(ch.clone());
                break;
            }
        }
        Ok(())
    }
    /// Count channels per state, publish the counts into `self.stats`
    /// (Release ordering), and return the current SearchPending (not yet
    /// sent) count, which `check_channel_states` uses as its budget.
    fn update_channel_state_counts(&mut self) -> (u64,) {
        let mut unknown_address_count = 0;
        let mut search_pending_count = 0;
        let mut search_pending_did_send_count = 0;
        let mut unassigned_count = 0;
        let mut assigned_count = 0;
        let mut no_address_count = 0;
        for (_ch, st) in self.channel_states.inner().iter() {
            match &st.value {
                ChannelStateValue::Active(st2) => match st2 {
                    // Init and WaitForStatusSeriesId are lumped into "unknown address".
                    ActiveChannelState::Init { .. } => {
                        unknown_address_count += 1;
                    }
                    ActiveChannelState::WaitForStatusSeriesId { .. } => {
                        unknown_address_count += 1;
                    }
                    ActiveChannelState::WithStatusSeriesId { state, .. } => match &state.inner {
                        WithStatusSeriesIdStateInner::UnknownAddress { .. } => {
                            unknown_address_count += 1;
                        }
                        WithStatusSeriesIdStateInner::SearchPending { did_send, .. } => {
                            if *did_send {
                                search_pending_did_send_count += 1;
                            } else {
                                search_pending_count += 1;
                            }
                        }
                        WithStatusSeriesIdStateInner::WithAddress { state, .. } => match state {
                            WithAddressState::Unassigned { .. } => {
                                unassigned_count += 1;
                            }
                            WithAddressState::Assigned(_) => {
                                assigned_count += 1;
                            }
                        },
                        WithStatusSeriesIdStateInner::NoAddress { .. } => {
                            no_address_count += 1;
                        }
                    },
                },
                // NOTE(review): ToRemove is also counted as "unknown address" — confirm intended.
                ChannelStateValue::ToRemove { .. } => {
                    unknown_address_count += 1;
                }
            }
        }
        use atomic::Ordering::Release;
        self.stats.channel_unknown_address.store(unknown_address_count, Release);
        self.stats.channel_search_pending.store(search_pending_count, Release);
        self.stats
            .search_pending_did_send
            .store(search_pending_did_send_count, Release);
        self.stats.unassigned.store(unassigned_count, Release);
        self.stats.assigned.store(assigned_count, Release);
        self.stats.channel_no_address.store(no_address_count, Release);
        (search_pending_count,)
    }
}

355
netfetch/src/ca/finder.rs Normal file
View File

@@ -0,0 +1,355 @@
use super::connset::CaConnSetEvent;
use super::connset::IocAddrQuery;
use super::connset::CURRENT_SEARCH_PENDING_MAX;
use super::connset::SEARCH_BATCH_MAX;
use crate::ca::findioc::FindIocRes;
use crate::ca::findioc::FindIocStream;
use crate::daemon_common::DaemonEvent;
use async_channel::Receiver;
use async_channel::Sender;
use dbpg::conn::make_pg_client;
use dbpg::postgres::Row as PgRow;
use err::Error;
use futures_util::FutureExt;
use futures_util::StreamExt;
use log::*;
use netpod::Database;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::net::SocketAddrV4;
use std::sync::atomic;
use std::sync::atomic::AtomicUsize;
use std::time::Duration;
use std::time::Instant;
use taskrun::tokio;
use tokio::task::JoinHandle;
// Number of parallel DB query workers (and pipeline depth of the batcher).
const SEARCH_DB_PIPELINE_LEN: usize = 4;
// Backpressure threshold on the CA finder's internal job queue.
const FINDER_JOB_QUEUE_LEN_MAX: usize = 10;
// Parameters for FindIocStream (CA network search).
const FINDER_BATCH_SIZE: usize = 8;
const FINDER_IN_FLIGHT_MAX: usize = 800;
const FINDER_TIMEOUT: Duration = Duration::from_millis(100);
// TODO pull out into a stats
static SEARCH_REQ_BATCH_RECV_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_RES_0_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_RES_1_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_RES_2_COUNT: AtomicUsize = AtomicUsize::new(0);
static SEARCH_RES_3_COUNT: AtomicUsize = AtomicUsize::new(0);
/// Debug-level logging for query batching; compiled out via the `if false`
/// guard — flip to `true` to trace batch execution.
#[allow(unused)]
macro_rules! debug_batch {
    // (D$($arg:tt)*) => ();
    ($($arg:tt)*) => (if false {
        debug!($($arg)*);
    });
}
/// Trace-level logging for query batching; compiled out via the `if false`
/// guard — flip to `true` to trace batch results.
#[allow(unused)]
macro_rules! trace_batch {
    // (D$($arg:tt)*) => ();
    ($($arg:tt)*) => (if false {
        trace!($($arg)*);
    });
}
/// Result marker for IOC address queries.
/// NOTE(review): appears unused here — results travel as
/// `ConnSetCmd::IocAddrQueryResult(VecDeque<FindIocRes>)`; confirm whether
/// this placeholder type is still needed.
#[derive(Debug)]
pub struct IocAddrQueryResult {}
/// Convert rows of `(channel, addr)` from the IOC-by-channel query into
/// `FindIocRes` entries.
///
/// A NULL address column and an address string that fails to parse both
/// yield `addr: None` (same as the original duplicated branches). Rows whose
/// channel column cannot be read as a string are logged and skipped.
fn transform_pgres(rows: Vec<PgRow>) -> VecDeque<FindIocRes> {
    let mut ret = VecDeque::new();
    for row in rows {
        match row.try_get::<_, String>(0) {
            Ok(ch) => {
                // NULL or unparseable address both collapse to None.
                let addr = row
                    .get::<_, Option<String>>(1)
                    .and_then(|a| a.parse().ok());
                ret.push_back(FindIocRes {
                    channel: ch,
                    response_addr: None,
                    addr,
                    dt: Duration::from_millis(0),
                });
            }
            Err(e) => {
                error!("bad string from pg: {e:?}");
            }
        }
    }
    ret
}
/// One DB-backed finder worker: receives batches of channel-name queries,
/// resolves their IOC addresses from `ioc_by_channel_log` (latest entry per
/// facility/channel), pads the result so every queried channel gets an entry
/// (missing ones with `addr: None`), and forwards the batch to the
/// `CaConnSet` event loop. Exits when either channel closes.
async fn finder_worker_single(
    inp: Receiver<Vec<IocAddrQuery>>,
    tx: Sender<CaConnSetEvent>,
    backend: String,
    db: Database,
) -> Result<(), Error> {
    let pg = make_pg_client(&db)
        .await
        .map_err(|e| Error::with_msg_no_trace(e.to_string()))?;
    // Latest known address per (facility, channel), restricted to the given
    // backend ($1) and the batch of channel names ($2).
    let sql = concat!(
        "with q1 as (select * from unnest($2::text[]) as unn (ch))",
        " select distinct on (tt.facility, tt.channel) tt.channel, tt.addr",
        " from ioc_by_channel_log tt join q1 on tt.channel = q1.ch and tt.facility = $1 and tt.addr is not null",
        " order by tt.facility, tt.channel, tsmod desc",
    );
    let qu_select_multi = pg
        .prepare(sql)
        .await
        .map_err(|e| Error::with_msg_no_trace(e.to_string()))?;
    // Running total of queried channels that had no DB row.
    let mut resdiff = 0;
    loop {
        match inp.recv().await {
            Ok(batch) => {
                SEARCH_REQ_BATCH_RECV_COUNT.fetch_add(batch.len(), atomic::Ordering::AcqRel);
                let ts1 = Instant::now();
                debug_batch!("run query batch len {}", batch.len());
                let names: Vec<_> = batch.iter().map(|x| x.name.as_str()).collect();
                let qres = pg.query(&qu_select_multi, &[&backend, &names]).await;
                let dt = ts1.elapsed();
                debug_batch!(
                    "done query batch len {}: {} {:.3}ms",
                    batch.len(),
                    qres.is_ok(),
                    dt.as_secs_f32() * 1e3
                );
                // Dump the offending channel list when a batch query takes > 5 s.
                if dt > Duration::from_millis(5000) {
                    let mut out = String::from("[");
                    for e in &batch {
                        if out.len() > 1 {
                            out.push_str(", ");
                        }
                        out.push('\'');
                        out.push_str(&e.name);
                        out.push('\'');
                    }
                    out.push(']');
                    eprintln!("VERY SLOW QUERY\n{out}");
                }
                match qres {
                    Ok(rows) => {
                        if rows.len() > batch.len() {
                            error!("MORE RESULTS THAN INPUT");
                        } else if rows.len() < batch.len() {
                            resdiff += batch.len() - rows.len();
                        }
                        let nbatch = batch.len();
                        trace_batch!("received results {} resdiff {}", rows.len(), resdiff);
                        SEARCH_RES_0_COUNT.fetch_add(rows.len(), atomic::Ordering::AcqRel);
                        let items = transform_pgres(rows);
                        // Channels present in the DB result; used to synthesize
                        // addr-less entries for the rest so the output matches the input.
                        let names: HashMap<_, _> = items.iter().map(|x| (&x.channel, true)).collect();
                        let mut to_add = Vec::new();
                        for e in batch {
                            let s = e.name;
                            if !names.contains_key(&s) {
                                let item = FindIocRes {
                                    channel: s,
                                    response_addr: None,
                                    addr: None,
                                    dt: Duration::from_millis(0),
                                };
                                to_add.push(item);
                            }
                        }
                        SEARCH_RES_1_COUNT.fetch_add(items.len(), atomic::Ordering::AcqRel);
                        SEARCH_RES_2_COUNT.fetch_add(to_add.len(), atomic::Ordering::AcqRel);
                        let mut items = items;
                        items.extend(to_add.into_iter());
                        if items.len() != nbatch {
                            error!("STILL NOT MATCHING LEN");
                        }
                        SEARCH_RES_3_COUNT.fetch_add(items.len(), atomic::Ordering::AcqRel);
                        let x = tx
                            .send(CaConnSetEvent::ConnSetCmd(
                                crate::ca::connset::ConnSetCmd::IocAddrQueryResult(items),
                            ))
                            .await;
                        match x {
                            Ok(_) => {}
                            Err(e) => {
                                // Downstream event loop is gone: stop this worker.
                                error!("finder sees: {e}");
                                break;
                            }
                        }
                    }
                    Err(e) => {
                        // DB error: log and back off before the next batch.
                        error!("finder sees error: {e}");
                        tokio::time::sleep(Duration::from_millis(1000)).await;
                    }
                }
            }
            Err(_e) => break,
        }
    }
    Ok(())
}
/// Fan out finder work: batch incoming queries (by count and 200 ms window)
/// and spawn SEARCH_DB_PIPELINE_LEN `finder_worker_single` tasks sharing the
/// batch receiver. Returns once the workers are spawned.
async fn finder_worker(
    qrx: Receiver<IocAddrQuery>,
    tx: Sender<CaConnSetEvent>,
    backend: String,
    db: Database,
) -> Result<(), Error> {
    // TODO do something with join handle
    let (batch_rx, _jh) = batchtools::batcher::batch(
        SEARCH_BATCH_MAX,
        Duration::from_millis(200),
        SEARCH_DB_PIPELINE_LEN,
        qrx,
    );
    for _ in 0..SEARCH_DB_PIPELINE_LEN {
        // TODO use join handle
        tokio::spawn(finder_worker_single(
            batch_rx.clone(),
            tx.clone(),
            backend.clone(),
            db.clone(),
        ));
    }
    Ok(())
}
/// Spawn the DB-backed IOC finder. Returns the query sender (bounded by
/// CURRENT_SEARCH_PENDING_MAX) and the join handle of the worker-spawning
/// task; results are delivered to `tx` as `ConnSetCmd::IocAddrQueryResult`.
pub fn start_finder(
    tx: Sender<CaConnSetEvent>,
    backend: String,
    db: Database,
) -> (Sender<IocAddrQuery>, JoinHandle<Result<(), Error>>) {
    let (qtx, qrx) = async_channel::bounded(CURRENT_SEARCH_PENDING_MAX);
    let jh = taskrun::spawn(finder_worker(qrx, tx, backend, db));
    (qtx, jh)
}
/// A future that can be switched off: while the inner future is absent,
/// polling yields `Pending` forever. Intended for `select!` branches that
/// are gated with `if x.is_enabled()`, which is why the disabled case never
/// needs to register a waker.
struct OptFut<F> {
    fut: Option<F>,
}

impl<F> OptFut<F> {
    /// Disabled: polling will always be `Pending`.
    fn empty() -> Self {
        Self { fut: None }
    }

    /// Enabled, wrapping `fut`.
    fn new(fut: F) -> Self {
        Self { fut: Some(fut) }
    }

    /// Whether an inner future is present.
    fn is_enabled(&self) -> bool {
        self.fut.is_some()
    }
}

impl<F> futures_util::Future for OptFut<F>
where
    F: futures_util::Future + std::marker::Unpin,
{
    type Output = <F as futures_util::Future>::Output;

    fn poll(mut self: std::pin::Pin<&mut Self>, cx: &mut std::task::Context) -> std::task::Poll<Self::Output> {
        if let Some(f) = self.fut.as_mut() {
            f.poll_unpin(cx)
        } else {
            std::task::Poll::Pending
        }
    }
}
/// Alternative finder that searches IOCs over the CA protocol instead of the
/// DB. Drives a `FindIocStream` in a `select!` loop, throttling input intake
/// when the job queue exceeds FINDER_JOB_QUEUE_LEN_MAX, and forwards results
/// to `tx` as `DaemonEvent::SearchDone` via an intermediate channel.
/// Currently unused (`#[allow(unused)]`).
#[allow(unused)]
fn start_finder_ca(tx: Sender<DaemonEvent>, tgts: Vec<SocketAddrV4>) -> (Sender<String>, JoinHandle<()>) {
    let (qtx, qrx) = async_channel::bounded(32);
    let (atx, arx) = async_channel::bounded(32);
    let ioc_finder_fut = async move {
        let mut finder = FindIocStream::new(tgts, FINDER_TIMEOUT, FINDER_IN_FLIGHT_MAX, FINDER_BATCH_SIZE);
        let fut_tick_dur = Duration::from_millis(100);
        // finder_more / qrx_more: whether the finder stream / input channel
        // are still alive and should keep being polled.
        let mut finder_more = true;
        let mut finder_fut = OptFut::new(finder.next());
        let mut qrx_fut = OptFut::new(qrx.recv());
        let mut qrx_more = true;
        let mut fut_tick = Box::pin(tokio::time::sleep(fut_tick_dur));
        let mut asend = OptFut::empty();
        loop {
            tokio::select! {
                // Completion of an in-flight result forward to atx.
                _ = &mut asend, if asend.is_enabled() => {
                    asend = OptFut::empty();
                }
                // Next batch of results from the CA search stream.
                r1 = &mut finder_fut, if finder_fut.is_enabled() => {
                    finder_fut = OptFut::empty();
                    match r1 {
                        Some(item) => {
                            asend = OptFut::new(atx.send(item));
                        }
                        None => {
                            // TODO finder has stopped, do no longer poll on it
                            warn!("Finder has stopped");
                            finder_more = false;
                        }
                    }
                    if qrx_more && finder.job_queue_len() < FINDER_JOB_QUEUE_LEN_MAX {
                        qrx_fut = OptFut::new(qrx.recv());
                    }
                    if finder_more {
                        finder_fut = OptFut::new(finder.next());
                    }
                    fut_tick = Box::pin(tokio::time::sleep(fut_tick_dur));
                }
                // New channel name to search for.
                r2 = &mut qrx_fut, if qrx_fut.is_enabled() => {
                    qrx_fut = OptFut::empty();
                    match r2 {
                        Ok(item) => {
                            finder.push(item);
                        }
                        Err(e) => {
                            // TODO input is done... ignore from here on.
                            error!("Finder input channel error {e}");
                            qrx_more = false;
                        }
                    }
                    if qrx_more && finder.job_queue_len() < FINDER_JOB_QUEUE_LEN_MAX {
                        qrx_fut = OptFut::new(qrx.recv());
                    }
                    if finder_more {
                        finder_fut = OptFut::new(finder.next());
                    } else {
                        finder_fut = OptFut::empty();
                    }
                    fut_tick = Box::pin(tokio::time::sleep(fut_tick_dur));
                }
                // Periodic tick: re-arm whatever is still pollable.
                _ = &mut fut_tick => {
                    if qrx_more && finder.job_queue_len() < FINDER_JOB_QUEUE_LEN_MAX {
                        qrx_fut = OptFut::new(qrx.recv());
                    }
                    if finder_more {
                        finder_fut = OptFut::new(finder.next());
                    } else {
                        finder_fut = OptFut::empty();
                    }
                    fut_tick = Box::pin(tokio::time::sleep(fut_tick_dur));
                }
                else => {
                    error!("all branches are disabled");
                    break;
                }
            };
        }
    };
    let ioc_finder_jh = taskrun::spawn(ioc_finder_fut);
    // Forwarder task: relay finder results to the daemon event channel.
    taskrun::spawn({
        async move {
            while let Ok(item) = arx.recv().await {
                match tx.send(DaemonEvent::SearchDone(item)).await {
                    Ok(_) => {}
                    Err(e) => {
                        error!("search res fwd {e}");
                    }
                }
            }
            warn!("search res fwd nput broken");
        }
    });
    (qtx, ioc_finder_jh)
}

123
netfetch/src/ca/statemap.rs Normal file
View File

@@ -0,0 +1,123 @@
use crate::daemon_common::Channel;
use async_channel::Receiver;
use serde::Serialize;
use series::series::Existence;
use series::ChannelStatusSeriesId;
use series::SeriesId;
use std::collections::BTreeMap;
use std::net::SocketAddrV4;
use std::time::Instant;
use std::time::SystemTime;
pub const CHANNEL_STATUS_DUMMY_SCALAR_TYPE: i32 = i32::MIN + 1;
/// Lifecycle of a `CaConn` as tracked by the connection set.
#[derive(Debug)]
pub enum CaConnStateValue {
    /// Created; no status feedback observed yet.
    Fresh,
    /// At least one status feedback has been observed.
    HadFeedback,
    /// Shutdown in progress; `since` records when it was initiated.
    Shutdown { since: Instant },
}

/// Connection lifecycle value plus the time of the most recent feedback.
#[derive(Debug)]
pub struct CaConnState {
    pub last_feedback: Instant,
    pub value: CaConnStateValue,
}

impl CaConnState {
    /// New state with `last_feedback` initialized to the current instant.
    pub fn new(value: CaConnStateValue) -> Self {
        let last_feedback = Instant::now();
        Self { last_feedback, value }
    }
}
/// Whether a channel's assigned connection is currently established.
#[derive(Clone, Debug, Serialize)]
pub enum ConnectionStateValue {
    Unconnected,
    Connected {
        //#[serde(with = "serde_Instant")]
        since: SystemTime,
    },
}

/// Connection status plus the time it was last updated.
#[derive(Clone, Debug, Serialize)]
pub struct ConnectionState {
    //#[serde(with = "serde_Instant")]
    pub updated: SystemTime,
    pub value: ConnectionStateValue,
}
/// Assignment state of a channel whose IOC address is known.
#[derive(Clone, Debug, Serialize)]
pub enum WithAddressState {
    /// Not yet handed to a `CaConn`; `assign_at` is the earliest assignment time.
    Unassigned {
        //#[serde(with = "serde_Instant")]
        assign_at: SystemTime,
    },
    /// Assigned to a `CaConn`; carries the connection status.
    Assigned(ConnectionState),
}
/// Address-discovery state machine for a channel that already has its
/// status series id. Transitions are timeout-driven in
/// `CaConnSet::check_channel_states`:
/// UnknownAddress -> SearchPending -> WithAddress | NoAddress -> UnknownAddress.
#[derive(Clone, Debug, Serialize)]
pub enum WithStatusSeriesIdStateInner {
    /// Address not known yet; a search will be scheduled after a dwell time.
    UnknownAddress {
        since: SystemTime,
    },
    /// Search scheduled; `did_send` marks whether it was actually dispatched.
    SearchPending {
        //#[serde(with = "serde_Instant")]
        since: SystemTime,
        did_send: bool,
    },
    /// IOC address resolved; `state` tracks CaConn assignment.
    WithAddress {
        addr: SocketAddrV4,
        state: WithAddressState,
    },
    /// Search concluded without an address; retried after a dwell time.
    NoAddress {
        since: SystemTime,
    },
}
/// Wrapper around the address-discovery state machine.
#[derive(Clone, Debug, Serialize)]
pub struct WithStatusSeriesIdState {
    pub inner: WithStatusSeriesIdStateInner,
}
/// State of an active (not to-be-removed) channel:
/// Init -> WaitForStatusSeriesId -> WithStatusSeriesId.
#[derive(Clone, Debug)]
pub enum ActiveChannelState {
    Init {
        since: SystemTime,
    },
    /// Status-series lookup is in flight; times out back to Init.
    WaitForStatusSeriesId {
        since: SystemTime,
    },
    /// Status series id resolved; address discovery proceeds in `state`.
    WithStatusSeriesId {
        status_series_id: ChannelStatusSeriesId,
        state: WithStatusSeriesIdState,
    },
}
/// Top-level channel state: either actively managed or marked for removal.
#[derive(Debug)]
pub enum ChannelStateValue {
    Active(ActiveChannelState),
    /// Pending removal; `addr` is the CaConn address it may still be assigned to.
    ToRemove { addr: Option<SocketAddrV4> },
}

/// Per-channel state record held in `ChannelStateMap`.
#[derive(Debug)]
pub struct ChannelState {
    pub value: ChannelStateValue,
}
/// Ordered map from channel to its tracked state. Ordering matters: the
/// connection set resumes its incremental scans via `range_mut`.
#[derive(Debug)]
pub struct ChannelStateMap {
    map: BTreeMap<Channel, ChannelState>,
}

impl ChannelStateMap {
    /// Empty map.
    pub fn new() -> Self {
        let map = BTreeMap::new();
        Self { map }
    }

    /// Mutable access to the underlying map.
    pub fn inner(&mut self) -> &mut BTreeMap<Channel, ChannelState> {
        &mut self.map
    }
}