Add alternative time binner

This commit is contained in:
Dominik Werder
2022-11-18 12:59:55 +01:00
parent eebf8665ce
commit b3225ae4c1
17 changed files with 1283 additions and 177 deletions

View File

@@ -1,5 +1,6 @@
use crate::{ChannelEvents, Error, MergableEvents};
use crate::Error;
use futures_util::{Stream, StreamExt};
use items::sitem_data;
use items::{RangeCompletableItem, Sitemty, StreamItem};
use netpod::log::*;
use std::fmt;
@@ -7,50 +8,126 @@ use std::ops::ControlFlow;
use std::pin::Pin;
use std::task::{Context, Poll};
type MergeInp = Pin<Box<dyn Stream<Item = Sitemty<ChannelEvents>> + Send>>;
// Debug trace macro, currently switched off.
// `macro_rules!` tries its arms from top to bottom and both arms use the same
// catch-all pattern, so the first arm always matches and every `trace2!(..)`
// call expands to nothing. The second arm (forwarding to `eprintln!`) is
// unreachable as written.
// NOTE(review): presumably the second arm is kept so that tracing can be enabled
// by swapping the two arms; confirm before removing it.
#[allow(unused)]
macro_rules! trace2 {
($($arg:tt)*) => ();
($($arg:tt)*) => (eprintln!($($arg)*));
}
pub struct ChannelEventsMerger {
inps: Vec<Option<MergeInp>>,
items: Vec<Option<ChannelEvents>>,
// Debug trace macro, currently switched off.
// Both arms use the same catch-all pattern and `macro_rules!` picks the first
// arm that matches, so every `trace3!(..)` call expands to nothing. The second
// arm (forwarding to `eprintln!`) is unreachable as written.
// NOTE(review): presumably the second arm is kept so that tracing can be enabled
// by swapping the two arms; confirm before removing it.
#[allow(unused)]
macro_rules! trace3 {
($($arg:tt)*) => ();
($($arg:tt)*) => (eprintln!($($arg)*));
}
// Debug trace macro, currently switched off.
// Both arms use the same catch-all pattern and `macro_rules!` picks the first
// arm that matches, so every `trace4!(..)` call expands to nothing. The second
// arm (forwarding to `eprintln!`) is unreachable as written.
// NOTE(review): presumably the second arm is kept so that tracing can be enabled
// by swapping the two arms; confirm before removing it.
#[allow(unused)]
macro_rules! trace4 {
($($arg:tt)*) => ();
($($arg:tt)*) => (eprintln!($($arg)*));
}
/// Error returned by `Mergeable::move_into_existing` when events could not be
/// moved into the given target.
///
/// The merger in this file handles both variants the same way: it keeps the
/// source item and flags the current output batch for emission.
#[derive(Debug)]
pub enum MergeError {
// NOTE(review): presumably source and target containers can not be combined; confirm with the implementors.
NotCompatible,
// NOTE(review): presumably the target can not take more events; confirm with the implementors.
Full,
}
/// Converts a `MergeError` into the crate-wide error type by way of its
/// `Debug` representation.
impl From<MergeError> for err::Error {
    fn from(e: MergeError) -> Self {
        // The error carries no payload, so the variant name is the whole message.
        let msg = format!("{e:?}");
        msg.into()
    }
}
/// Interface that the generic `Merger` in this file requires from the
/// containers it merges.
///
/// `Rhs` is the container type that events are moved into; it defaults to `Self`.
pub trait Mergeable<Rhs = Self>: fmt::Debug + Unpin {
/// Number of events currently held.
fn len(&self) -> usize;
/// Timestamp used by the merger to pick the input to take from next.
/// The merger treats `None` as "item is empty" and drops the item.
/// NOTE(review): presumably the smallest timestamp in the container; confirm.
fn ts_min(&self) -> Option<u64>;
/// Timestamp compared against the `ts_min` of the runner-up input to decide
/// whether the whole item can be taken. The merger reports an error if this
/// is `None` for an item that has a `ts_min`.
/// NOTE(review): presumably the largest timestamp in the container; confirm.
fn ts_max(&self) -> Option<u64>;
// TODO remove, useless.
fn is_compatible_target(&self, tgt: &Rhs) -> bool;
// TODO rename to `append_*` to make it clear that they simply append, but not re-sort.
/// Moves events out of `self` into a newly created `Rhs`.
/// NOTE(review): `ts_end` presumably limits which events are moved; whether
/// the bound is inclusive is not visible here; confirm with the implementors.
fn move_into_fresh(&mut self, ts_end: u64) -> Rhs;
/// Moves events out of `self` into the existing `tgt`, or returns a
/// `MergeError` if that is not possible. `ts_end` as for `move_into_fresh`.
fn move_into_existing(&mut self, tgt: &mut Rhs, ts_end: u64) -> Result<(), MergeError>;
}
/// A merger input: a pinned, boxed, `Send` stream yielding `Sitemty<T>` items.
type MergeInp<T> = Pin<Box<dyn Stream<Item = Sitemty<T>> + Send>>;
/// Merges several input streams of `T` into a single output stream, batching
/// the merged events in `out` before emitting them.
pub struct Merger<T> {
/// The input streams, one slot per input.
/// NOTE(review): presumably a slot becomes `None` once its input is exhausted; that code is not visible here.
inps: Vec<Option<MergeInp<T>>>,
/// One slot per input (same index as `inps`) holding the item pulled from
/// that input which has not been fully moved to the output yet.
items: Vec<Option<T>>,
/// Output batch currently being accumulated; `None` if nothing is pending.
out: Option<T>,
/// Set when an item could not be moved into `out`; makes the next output
/// check emit `out` regardless of its length.
do_clear_out: bool,
/// `out` is emitted once its `len()` reaches this value.
out_max_len: usize,
/// When set, a `RangeComplete` item is emitted near the end of the stream.
range_complete: bool,
/// End-of-stream state flags. `poll_next` checks them in the order
/// `complete`, `done3`, `done2`, `done`, and each stage sets the next flag.
done: bool,
done2: bool,
done3: bool,
/// Set after the final `None` was returned; polling again panics.
complete: bool,
}
impl fmt::Debug for ChannelEventsMerger {
/// Debug output shows, for each input, only whether the slot is still
/// occupied, followed by the buffered items and a subset of the state flags.
impl<T> fmt::Debug for Merger<T>
where
    T: Mergeable,
{
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        // The boxed input streams carry no `Debug` bound, so print occupancy only.
        let inputs_present = self.inps.iter().map(Option::is_some).collect::<Vec<bool>>();
        let mut dbg = fmt.debug_struct(std::any::type_name::<Self>());
        dbg.field("inps", &inputs_present);
        dbg.field("items", &self.items);
        dbg.field("out_max_len", &self.out_max_len);
        dbg.field("range_complete", &self.range_complete);
        dbg.field("done", &self.done);
        dbg.field("done2", &self.done2);
        dbg.field("done3", &self.done3);
        dbg.finish()
    }
}
impl ChannelEventsMerger {
pub fn new(inps: Vec<MergeInp>) -> Self {
impl<T> Merger<T>
where
T: Mergeable,
{
/// Creates a merger over the given input streams.
///
/// `out_max_len` is the length threshold at which the accumulated output batch
/// gets emitted. All item slots start empty, no output batch is pending and
/// every state flag starts as `false`.
pub fn new(inps: Vec<MergeInp<T>>, out_max_len: usize) -> Self {
    let n = inps.len();
    Self {
        // Each field is initialized exactly once; repeating a field in a struct
        // expression is a compile error (E0062).
        inps: inps.into_iter().map(Some).collect(),
        // One empty item slot per input.
        items: (0..n).map(|_| None).collect(),
        out: None,
        do_clear_out: false,
        out_max_len,
        range_complete: false,
        done: false,
        done2: false,
        done3: false,
        complete: false,
    }
}
fn process(mut self: Pin<&mut Self>, _cx: &mut Context) -> Result<ControlFlow<ChannelEvents>, Error> {
/// Moves `src` into the pending output batch without a timestamp limit.
///
/// Delegates to `take_into_output_upto`, passing `u64::MAX` as the limit.
fn take_into_output_all(&mut self, src: &mut T) -> Result<(), MergeError> {
    // TODO optimize the case when some large batch should be added to some existing small batch already in out.
    // TODO maybe use two output slots?
    let no_limit = u64::MAX;
    self.take_into_output_upto(src, no_limit)
}
/// Moves events from `src` into the pending output batch, handing `upto` on to
/// the `Mergeable` move methods.
///
/// If no output batch is pending, a fresh one is created from `src`. Otherwise
/// `src` is moved into the existing batch and any `MergeError` from that is
/// returned to the caller.
fn take_into_output_upto(&mut self, src: &mut T, upto: u64) -> Result<(), MergeError> {
    // TODO optimize the case when some large batch should be added to some existing small batch already in out.
    // TODO maybe use two output slots?
    match self.out.as_mut() {
        Some(existing) => src.move_into_existing(existing, upto),
        None => {
            trace2!("move into fresh");
            self.out = Some(src.move_into_fresh(upto));
            Ok(())
        }
    }
}
fn process(mut self: Pin<&mut Self>, _cx: &mut Context) -> Result<ControlFlow<()>, Error> {
use ControlFlow::*;
let mut tslows = [None, None];
for (i1, itemopt) in self.items.iter_mut().enumerate() {
if let Some(item) = itemopt {
let t1 = item.ts_min();
if let Some(t1) = t1 {
if let Some(t1) = item.ts_min() {
if let Some((_, a)) = tslows[0] {
if t1 < a {
tslows[1] = tslows[0];
@@ -70,58 +147,72 @@ impl ChannelEventsMerger {
tslows[0] = Some((i1, t1));
}
} else {
match item {
ChannelEvents::Events(_) => {
trace!("events item without ts min discovered {item:?}");
itemopt.take();
return Ok(Continue(()));
}
ChannelEvents::Status(_) => {
return Err(format!("channel status without timestamp").into());
}
}
// the item seems empty.
trace2!("empty item, something to do here?");
*itemopt = None;
return Ok(Continue(()));
}
}
}
trace4!("tslows {tslows:?}");
if let Some((il0, _tl0)) = tslows[0] {
if let Some((_il1, tl1)) = tslows[1] {
// There is a second input, take only up to the second highest timestamp
let item = self.items[il0].as_mut().unwrap();
match item {
ChannelEvents::Events(item) => {
if let Some(th0) = item.ts_max() {
if th0 < tl1 {
let ret = self.items[il0].take().unwrap();
Ok(Break(ret))
} else {
let ritem = item.take_new_events_until_ts(tl1);
if item.len() == 0 {
// TODO should never be here
self.items[il0] = None;
}
Ok(Break(ChannelEvents::Events(ritem)))
if let Some(th0) = item.ts_max() {
if th0 <= tl1 {
// Can take the whole item
let mut item = self.items[il0].take().unwrap();
trace3!("Take all from item {item:?}");
match self.take_into_output_all(&mut item) {
Ok(()) => Ok(Break(())),
Err(MergeError::Full) | Err(MergeError::NotCompatible) => {
// TODO count for stats
trace3!("Put item back");
self.items[il0] = Some(item);
self.do_clear_out = true;
Ok(Break(()))
}
}
} else {
// Take only up to the lowest ts of the second-lowest input
let mut item = self.items[il0].take().unwrap();
trace3!("Take up to {tl1} from item {item:?}");
match self.take_into_output_upto(&mut item, tl1) {
Ok(()) => {
if item.len() == 0 {
// TODO should never be here because we should have taken the whole item
Err(format!("Should have taken the whole item instead").into())
} else {
self.items[il0] = Some(item);
Ok(Break(()))
}
}
Err(MergeError::Full) | Err(MergeError::NotCompatible) => {
// TODO count for stats
trace3!("Put item back");
self.items[il0] = Some(item);
self.do_clear_out = true;
Ok(Break(()))
}
} else {
// TODO should never be here because ts-max should always exist here.
let ritem = item.take_new_events_until_ts(tl1);
if item.len() == 0 {}
Ok(Break(ChannelEvents::Events(ritem)))
}
}
ChannelEvents::Status(_) => {
let ret = self.items[il0].take().unwrap();
Ok(Break(ret))
}
} else {
// TODO should never be here because ts-max should always exist here.
Err(format!("selected input without max ts").into())
}
} else {
let item = self.items[il0].as_mut().unwrap();
match item {
ChannelEvents::Events(_) => {
let ret = self.items[il0].take().unwrap();
Ok(Break(ret))
}
ChannelEvents::Status(_) => {
let ret = self.items[il0].take().unwrap();
Ok(Break(ret))
// No other input, take the whole item
let mut item = self.items[il0].take().unwrap();
trace3!("Take all from item (no other input) {item:?}");
match self.take_into_output_all(&mut item) {
Ok(()) => Ok(Break(())),
Err(_) => {
// TODO count for stats
trace3!("Put item back");
self.items[il0] = Some(item);
self.do_clear_out = true;
Ok(Break(()))
}
}
}
@@ -131,6 +222,7 @@ impl ChannelEventsMerger {
}
fn refill(mut self: Pin<&mut Self>, cx: &mut Context) -> ControlFlow<Poll<Error>> {
trace4!("refill");
use ControlFlow::*;
use Poll::*;
let mut has_pending = false;
@@ -138,6 +230,7 @@ impl ChannelEventsMerger {
let item = &self.items[i1];
if item.is_none() {
while let Some(inp) = &mut self.inps[i1] {
trace4!("refill while");
match inp.poll_next_unpin(cx) {
Ready(Some(Ok(k))) => {
match k {
@@ -149,22 +242,6 @@ impl ChannelEventsMerger {
eprintln!("TODO inp RangeComplete which does not fill slot");
}
RangeCompletableItem::Data(k) => {
match &k {
ChannelEvents::Events(events) => {
if events.len() == 0 {
warn!("empty events item {events:?}");
} else {
trace!(
"\nrefilled with events {}\nREFILLED\n{:?}\n\n",
events.len(),
events
);
}
}
ChannelEvents::Status(_) => {
eprintln!("TODO inp Status which does not fill slot");
}
}
self.items[i1] = Some(k);
break;
}
@@ -186,6 +263,8 @@ impl ChannelEventsMerger {
}
}
}
} else {
trace4!("refill inp {} has {}", i1, item.as_ref().unwrap().len());
}
}
if has_pending {
@@ -195,17 +274,13 @@ impl ChannelEventsMerger {
}
}
fn poll2(mut self: Pin<&mut Self>, cx: &mut Context) -> ControlFlow<Poll<Option<Result<ChannelEvents, Error>>>> {
fn poll3(
mut self: Pin<&mut Self>,
cx: &mut Context,
has_pending: bool,
) -> ControlFlow<Poll<Option<Result<T, Error>>>> {
use ControlFlow::*;
use Poll::*;
let mut has_pending = false;
match Self::refill(Pin::new(&mut self), cx) {
Break(Ready(e)) => return Break(Ready(Some(Err(e)))),
Break(Pending) => {
has_pending = true;
}
Continue(()) => {}
}
let ninps = self.inps.iter().filter(|a| a.is_some()).count();
let nitems = self.items.iter().filter(|a| a.is_some()).count();
let nitemsmissing = self
@@ -214,6 +289,7 @@ impl ChannelEventsMerger {
.zip(self.items.iter())
.filter(|(a, b)| a.is_some() && b.is_none())
.count();
trace3!("ninps {ninps} nitems {nitems} nitemsmissing {nitemsmissing}");
if ninps == 0 && nitems == 0 {
self.done = true;
Break(Ready(None))
@@ -226,33 +302,74 @@ impl ChannelEventsMerger {
}
} else {
match Self::process(Pin::new(&mut self), cx) {
Ok(Break(item)) => Break(Ready(Some(Ok(item)))),
Ok(Continue(())) => Continue(()),
Ok(Break(())) => {
if let Some(o) = self.out.as_ref() {
// A good threshold varies according to scalar type and shape.
// TODO replace this magic number by a bound on the bytes estimate.
if o.len() >= self.out_max_len || self.do_clear_out {
trace3!("decide to output");
self.do_clear_out = false;
Break(Ready(Some(Ok(self.out.take().unwrap()))))
} else {
trace4!("output not yet");
Continue(())
}
} else {
trace3!("no output candidate");
Continue(())
}
}
Ok(Continue(())) => {
trace2!("process returned with Continue");
Continue(())
}
Err(e) => Break(Ready(Some(Err(e)))),
}
}
}
/// Runs one merge step: refills the item slots from the inputs, then hands
/// over to `poll3`, telling it whether any input was still pending.
///
/// An error reported by `refill` ends the step immediately and is returned as
/// a ready error item.
fn poll2(mut self: Pin<&mut Self>, cx: &mut Context) -> ControlFlow<Poll<Option<Result<T, Error>>>> {
    let has_pending = match Self::refill(Pin::new(&mut self), cx) {
        ControlFlow::Break(Poll::Ready(e)) => {
            return ControlFlow::Break(Poll::Ready(Some(Err(e))));
        }
        ControlFlow::Break(Poll::Pending) => true,
        ControlFlow::Continue(()) => false,
    };
    Self::poll3(self, cx, has_pending)
}
}
impl Stream for ChannelEventsMerger {
type Item = Sitemty<ChannelEvents>;
impl<T> Stream for Merger<T>
where
T: Mergeable,
{
type Item = Sitemty<T>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
use Poll::*;
const NAME: &str = "ChannelEventsMerger";
const NAME: &str = "Merger_mergeable";
let span = span!(Level::TRACE, NAME);
let _spanguard = span.enter();
loop {
trace3!("{NAME} poll");
break if self.complete {
panic!("poll after complete");
} else if self.done2 {
} else if self.done3 {
self.complete = true;
Ready(None)
} else if self.done2 {
self.done3 = true;
if self.range_complete {
warn!("TODO emit range complete only if all inputs signaled complete");
trace!("{NAME} emit RangeComplete");
Ready(Some(Ok(StreamItem::DataItem(RangeCompletableItem::RangeComplete))))
} else {
continue;
}
} else if self.done {
self.done2 = true;
if self.range_complete {
trace!("MERGER EMITTING ChannelEvents::RangeComplete");
Ready(Some(Ok(StreamItem::DataItem(RangeCompletableItem::RangeComplete))))
if let Some(out) = self.out.take() {
Ready(Some(sitem_data(out)))
} else {
continue;
}
@@ -260,18 +377,7 @@ impl Stream for ChannelEventsMerger {
match Self::poll2(self.as_mut(), cx) {
ControlFlow::Continue(()) => continue,
ControlFlow::Break(k) => match k {
Ready(Some(Ok(ChannelEvents::Events(item)))) => {
trace!("\n\nMERGER EMITTING\n{:?}\n\n", item);
Ready(Some(Ok(StreamItem::DataItem(RangeCompletableItem::Data(
ChannelEvents::Events(item),
)))))
}
Ready(Some(Ok(ChannelEvents::Status(item)))) => {
trace!("\n\nMERGER EMITTING\n{:?}\n\n", item);
Ready(Some(Ok(StreamItem::DataItem(RangeCompletableItem::Data(
ChannelEvents::Status(item),
)))))
}
Ready(Some(Ok(item))) => Ready(Some(sitem_data(item))),
Ready(Some(Err(e))) => {
self.done = true;
Ready(Some(Err(e.into())))