Open data via index file

This commit is contained in:
Dominik Werder
2021-04-28 11:35:06 +02:00
parent 09ea4175dc
commit 0204c37017
23 changed files with 966 additions and 709 deletions

View File

@@ -3,20 +3,22 @@ Aggregation and binning support.
*/
use super::eventchunker::EventFull;
use bytes::{BufMut, Bytes, BytesMut};
use crate::agg::eventbatch::MinMaxAvgScalarEventBatch;
use err::Error;
use futures_core::Stream;
use futures_util::StreamExt;
use netpod::timeunits::SEC;
use netpod::{BinSpecDimT, NanoRange};
use netpod::NanoRange;
use netpod::{Node, ScalarType};
use serde::{Deserialize, Serialize};
use std::mem::size_of;
use std::pin::Pin;
use std::task::{Context, Poll};
#[allow(unused_imports)]
use tracing::{debug, error, info, span, trace, warn, Level};
pub mod binnedt;
pub mod binnedx;
pub mod eventbatch;
pub mod scalarbinbatch;
pub trait AggregatorTdim {
type InputValue;
type OutputValue: AggregatableXdim1Bin + AggregatableTdim;
@@ -35,7 +37,7 @@ pub trait AggregatableXdim1Bin {
pub trait AggregatableTdim {
type Output: AggregatableXdim1Bin + AggregatableTdim;
type Aggregator: AggregatorTdim<InputValue = Self>;
fn aggregator_new(&self, ts1: u64, ts2: u64) -> Self::Aggregator;
fn aggregator_new_static(ts1: u64, ts2: u64) -> Self::Aggregator;
}
/// DO NOT USE. This is just a dummy for some testing.
@@ -49,7 +51,7 @@ impl AggregatableXdim1Bin for () {
impl AggregatableTdim for () {
type Output = ();
type Aggregator = ();
fn aggregator_new(&self, _ts1: u64, _ts2: u64) -> Self::Aggregator {
fn aggregator_new_static(_ts1: u64, _ts2: u64) -> Self::Aggregator {
todo!()
}
}
@@ -201,329 +203,6 @@ impl AggregatableXdim1Bin for ValuesDim0 {
}
}
#[derive(Serialize, Deserialize)]
pub struct MinMaxAvgScalarEventBatch {
pub tss: Vec<u64>,
pub mins: Vec<f32>,
pub maxs: Vec<f32>,
pub avgs: Vec<f32>,
}
impl MinMaxAvgScalarEventBatch {
pub fn empty() -> Self {
Self {
tss: vec![],
mins: vec![],
maxs: vec![],
avgs: vec![],
}
}
pub fn from_full_frame(buf: &Bytes) -> Self {
info!("construct MinMaxAvgScalarEventBatch from full frame len {}", buf.len());
assert!(buf.len() >= 4);
let mut g = MinMaxAvgScalarEventBatch::empty();
let n1;
unsafe {
let ptr = (&buf[0] as *const u8) as *const [u8; 4];
n1 = u32::from_le_bytes(*ptr);
trace!("--- +++ --- +++ --- +++ n1: {}", n1);
}
if n1 == 0 {
g
} else {
let n2 = n1 as usize;
g.tss.reserve(n2);
g.mins.reserve(n2);
g.maxs.reserve(n2);
g.avgs.reserve(n2);
unsafe {
// TODO Can I unsafely create ptrs and just assign them?
// TODO What are cases where I really need transmute?
g.tss.set_len(n2);
g.mins.set_len(n2);
g.maxs.set_len(n2);
g.avgs.set_len(n2);
let ptr0 = &buf[4] as *const u8;
{
let ptr1 = ptr0 as *const u64;
for i1 in 0..n2 {
g.tss[i1] = *ptr1.add(i1);
}
}
{
let ptr1 = ptr0.add((8) * n2) as *const f32;
for i1 in 0..n2 {
g.mins[i1] = *ptr1.add(i1);
}
}
{
let ptr1 = ptr0.add((8 + 4) * n2) as *const f32;
for i1 in 0..n2 {
g.maxs[i1] = *ptr1;
}
}
{
let ptr1 = ptr0.add((8 + 4 + 4) * n2) as *const f32;
for i1 in 0..n2 {
g.avgs[i1] = *ptr1;
}
}
}
info!("CONTENT {:?}", g);
g
}
}
}
impl std::fmt::Debug for MinMaxAvgScalarEventBatch {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(
fmt,
"MinMaxAvgScalarEventBatch count {} tss {:?} mins {:?} maxs {:?} avgs {:?}",
self.tss.len(),
self.tss,
self.mins,
self.maxs,
self.avgs,
)
}
}
impl AggregatableXdim1Bin for MinMaxAvgScalarEventBatch {
type Output = MinMaxAvgScalarEventBatch;
fn into_agg(self) -> Self::Output {
self
}
}
impl AggregatableTdim for MinMaxAvgScalarEventBatch {
type Output = MinMaxAvgScalarBinBatch;
type Aggregator = MinMaxAvgScalarEventBatchAggregator;
fn aggregator_new(&self, ts1: u64, ts2: u64) -> Self::Aggregator {
MinMaxAvgScalarEventBatchAggregator::new(ts1, ts2)
}
}
impl MinMaxAvgScalarEventBatch {
#[allow(dead_code)]
fn old_serialized(&self) -> Bytes {
let n1 = self.tss.len();
let mut g = BytesMut::with_capacity(4 + n1 * (8 + 3 * 4));
g.put_u32_le(n1 as u32);
if n1 > 0 {
let ptr = &self.tss[0] as *const u64 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<u64>() * n1) };
g.put(a);
let ptr = &self.mins[0] as *const f32 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<f32>() * n1) };
g.put(a);
let ptr = &self.maxs[0] as *const f32 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<f32>() * n1) };
g.put(a);
let ptr = &self.avgs[0] as *const f32 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<f32>() * n1) };
g.put(a);
}
info!("impl Frameable for MinMaxAvgScalarEventBatch g.len() {}", g.len());
g.freeze()
}
}
pub struct MinMaxAvgScalarEventBatchAggregator {
ts1: u64,
ts2: u64,
count: u64,
min: f32,
max: f32,
sum: f32,
}
impl MinMaxAvgScalarEventBatchAggregator {
pub fn new(ts1: u64, ts2: u64) -> Self {
Self {
ts1,
ts2,
min: f32::MAX,
max: f32::MIN,
sum: 0f32,
count: 0,
}
}
}
impl AggregatorTdim for MinMaxAvgScalarEventBatchAggregator {
type InputValue = MinMaxAvgScalarEventBatch;
type OutputValue = MinMaxAvgScalarBinSingle;
fn ends_before(&self, inp: &Self::InputValue) -> bool {
match inp.tss.last() {
Some(ts) => *ts < self.ts1,
None => true,
}
}
fn ends_after(&self, inp: &Self::InputValue) -> bool {
match inp.tss.last() {
Some(ts) => *ts >= self.ts2,
_ => panic!(),
}
}
fn starts_after(&self, inp: &Self::InputValue) -> bool {
match inp.tss.first() {
Some(ts) => *ts >= self.ts2,
_ => panic!(),
}
}
fn ingest(&mut self, v: &Self::InputValue) {
for i1 in 0..v.tss.len() {
let ts = v.tss[i1];
if ts < self.ts1 {
//info!("EventBatchAgg {} {} {} {} IS BEFORE", v.tss[i1], v.mins[i1], v.maxs[i1], v.avgs[i1]);
continue;
} else if ts >= self.ts2 {
//info!("EventBatchAgg {} {} {} {} IS AFTER", v.tss[i1], v.mins[i1], v.maxs[i1], v.avgs[i1]);
continue;
} else {
//info!("EventBatchAgg {} {} {} {}", v.tss[i1], v.mins[i1], v.maxs[i1], v.avgs[i1]);
self.min = self.min.min(v.mins[i1]);
self.max = self.max.max(v.maxs[i1]);
self.sum += v.avgs[i1];
self.count += 1;
}
}
}
fn result(self) -> Self::OutputValue {
let min = if self.min == f32::MAX { f32::NAN } else { self.min };
let max = if self.max == f32::MIN { f32::NAN } else { self.max };
let avg = if self.count == 0 {
f32::NAN
} else {
self.sum / self.count as f32
};
MinMaxAvgScalarBinSingle {
ts1: self.ts1,
ts2: self.ts2,
count: self.count,
min,
max,
avg,
}
}
}
#[allow(dead_code)]
#[derive(Serialize, Deserialize)]
pub struct MinMaxAvgScalarBinBatch {
ts1s: Vec<u64>,
ts2s: Vec<u64>,
counts: Vec<u64>,
mins: Vec<f32>,
maxs: Vec<f32>,
avgs: Vec<f32>,
}
impl MinMaxAvgScalarBinBatch {
pub fn empty() -> Self {
Self {
ts1s: vec![],
ts2s: vec![],
counts: vec![],
mins: vec![],
maxs: vec![],
avgs: vec![],
}
}
pub fn len(&self) -> usize {
self.ts1s.len()
}
pub fn push_single(&mut self, g: &MinMaxAvgScalarBinSingle) {
self.ts1s.push(g.ts1);
self.ts2s.push(g.ts2);
self.counts.push(g.count);
self.mins.push(g.min);
self.maxs.push(g.max);
self.avgs.push(g.avg);
}
pub fn from_full_frame(buf: &Bytes) -> Self {
info!("MinMaxAvgScalarBinBatch construct from full frame len {}", buf.len());
assert!(buf.len() >= 4);
let mut g = MinMaxAvgScalarBinBatch::empty();
let n1;
unsafe {
let ptr = (&buf[0] as *const u8) as *const [u8; 4];
n1 = u32::from_le_bytes(*ptr);
trace!(
"MinMaxAvgScalarBinBatch construct --- +++ --- +++ --- +++ n1: {}",
n1
);
}
if n1 == 0 {
g
} else {
let n2 = n1 as usize;
g.ts1s.reserve(n2);
g.ts2s.reserve(n2);
g.counts.reserve(n2);
g.mins.reserve(n2);
g.maxs.reserve(n2);
g.avgs.reserve(n2);
unsafe {
// TODO Can I unsafely create ptrs and just assign them?
// TODO What are cases where I really need transmute?
g.ts1s.set_len(n2);
g.ts2s.set_len(n2);
g.counts.set_len(n2);
g.mins.set_len(n2);
g.maxs.set_len(n2);
g.avgs.set_len(n2);
let ptr0 = &buf[4] as *const u8;
{
let ptr1 = ptr0.add(0) as *const u64;
for i1 in 0..n2 {
g.ts1s[i1] = *ptr1.add(i1);
}
}
{
let ptr1 = ptr0.add((8) * n2) as *const u64;
for i1 in 0..n2 {
g.ts2s[i1] = *ptr1.add(i1);
}
}
{
let ptr1 = ptr0.add((8 + 8) * n2) as *const u64;
for i1 in 0..n2 {
g.counts[i1] = *ptr1.add(i1);
}
}
{
let ptr1 = ptr0.add((8 + 8 + 8) * n2) as *const f32;
for i1 in 0..n2 {
g.mins[i1] = *ptr1.add(i1);
}
}
{
let ptr1 = ptr0.add((8 + 8 + 8 + 4) * n2) as *const f32;
for i1 in 0..n2 {
g.maxs[i1] = *ptr1;
}
}
{
let ptr1 = ptr0.add((8 + 8 + 8 + 4 + 4) * n2) as *const f32;
for i1 in 0..n2 {
g.avgs[i1] = *ptr1;
}
}
}
info!("CONTENT {:?}", g);
g
}
}
}
pub enum Fits {
Empty,
Lower,
@@ -538,116 +217,6 @@ pub trait FitsInside {
fn fits_inside(&self, range: NanoRange) -> Fits;
}
impl FitsInside for MinMaxAvgScalarBinBatch {
fn fits_inside(&self, range: NanoRange) -> Fits {
if self.ts1s.is_empty() {
Fits::Empty
} else {
let t1 = *self.ts1s.first().unwrap();
let t2 = *self.ts2s.last().unwrap();
if t2 <= range.beg {
Fits::Lower
} else if t1 >= range.end {
Fits::Greater
} else if t1 < range.beg && t2 > range.end {
Fits::PartlyLowerAndGreater
} else if t1 < range.beg {
Fits::PartlyLower
} else if t2 > range.end {
Fits::PartlyGreater
} else {
Fits::Inside
}
}
}
}
impl MinMaxAvgScalarBinBatch {
#[allow(dead_code)]
fn old_serialized(&self) -> Bytes {
let n1 = self.ts1s.len();
let mut g = BytesMut::with_capacity(4 + n1 * (3 * 8 + 3 * 4));
g.put_u32_le(n1 as u32);
if n1 > 0 {
let ptr = &self.ts1s[0] as *const u64 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<u64>() * n1) };
g.put(a);
let ptr = &self.ts2s[0] as *const u64 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<u64>() * n1) };
g.put(a);
let ptr = &self.counts[0] as *const u64 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<u64>() * n1) };
g.put(a);
let ptr = &self.mins[0] as *const f32 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<f32>() * n1) };
g.put(a);
let ptr = &self.maxs[0] as *const f32 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<f32>() * n1) };
g.put(a);
let ptr = &self.avgs[0] as *const f32 as *const u8;
let a = unsafe { std::slice::from_raw_parts(ptr, size_of::<f32>() * n1) };
g.put(a);
}
g.freeze()
}
}
impl std::fmt::Debug for MinMaxAvgScalarBinBatch {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(
fmt,
"MinMaxAvgScalarBinBatch count {} ts1s {:?} ts2s {:?} counts {:?} avgs {:?}",
self.ts1s.len(),
self.ts1s.iter().map(|k| k / SEC).collect::<Vec<_>>(),
self.ts2s.iter().map(|k| k / SEC).collect::<Vec<_>>(),
self.counts,
self.avgs
)
}
}
impl AggregatableXdim1Bin for MinMaxAvgScalarBinBatch {
type Output = MinMaxAvgScalarBinBatch;
fn into_agg(self) -> Self::Output {
todo!()
}
}
impl AggregatableTdim for MinMaxAvgScalarBinBatch {
type Output = MinMaxAvgScalarBinSingle;
type Aggregator = MinMaxAvgScalarBinBatchAggregator;
fn aggregator_new(&self, _ts1: u64, _ts2: u64) -> Self::Aggregator {
todo!()
}
}
pub struct MinMaxAvgScalarBinBatchAggregator {}
impl AggregatorTdim for MinMaxAvgScalarBinBatchAggregator {
type InputValue = MinMaxAvgScalarBinBatch;
type OutputValue = MinMaxAvgScalarBinSingle;
fn ends_before(&self, _inp: &Self::InputValue) -> bool {
todo!()
}
fn ends_after(&self, _inp: &Self::InputValue) -> bool {
todo!()
}
fn starts_after(&self, _inp: &Self::InputValue) -> bool {
todo!()
}
fn ingest(&mut self, _v: &Self::InputValue) {
todo!()
}
fn result(self) -> Self::OutputValue {
todo!()
}
}
pub struct MinMaxAvgScalarBinSingle {
ts1: u64,
ts2: u64,
@@ -670,7 +239,7 @@ impl std::fmt::Debug for MinMaxAvgScalarBinSingle {
impl AggregatableTdim for MinMaxAvgScalarBinSingle {
type Output = MinMaxAvgScalarBinSingle;
type Aggregator = MinMaxAvgScalarBinSingleAggregator;
fn aggregator_new(&self, _ts1: u64, _ts2: u64) -> Self::Aggregator {
fn aggregator_new_static(_ts1: u64, _ts2: u64) -> Self::Aggregator {
todo!()
}
}
@@ -871,187 +440,6 @@ where
}
}
pub trait IntoBinnedXBins1<I: AggregatableXdim1Bin> {
type StreamOut;
fn into_binned_x_bins_1(self) -> Self::StreamOut
where
Self: Stream<Item = Result<I, Error>>;
}
impl<T, I: AggregatableXdim1Bin> IntoBinnedXBins1<I> for T
where
T: Stream<Item = Result<I, Error>> + Unpin,
{
type StreamOut = IntoBinnedXBins1DefaultStream<T, I>;
fn into_binned_x_bins_1(self) -> Self::StreamOut {
IntoBinnedXBins1DefaultStream { inp: self }
}
}
pub struct IntoBinnedXBins1DefaultStream<S, I>
where
S: Stream<Item = Result<I, Error>> + Unpin,
I: AggregatableXdim1Bin,
{
inp: S,
}
impl<S, I> Stream for IntoBinnedXBins1DefaultStream<S, I>
where
S: Stream<Item = Result<I, Error>> + Unpin,
I: AggregatableXdim1Bin,
{
type Item = Result<I::Output, Error>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
use Poll::*;
match self.inp.poll_next_unpin(cx) {
Ready(Some(Ok(k))) => Ready(Some(Ok(k.into_agg()))),
Ready(Some(Err(e))) => Ready(Some(Err(e))),
Ready(None) => Ready(None),
Pending => Pending,
}
}
}
pub trait IntoBinnedT {
type StreamOut: Stream;
fn into_binned_t(self, spec: BinSpecDimT) -> Self::StreamOut;
}
impl<T, I> IntoBinnedT for T
where
I: AggregatableTdim + Unpin,
T: Stream<Item = Result<I, Error>> + Unpin,
I::Aggregator: Unpin,
{
type StreamOut = IntoBinnedTDefaultStream<T, I>;
fn into_binned_t(self, spec: BinSpecDimT) -> Self::StreamOut {
IntoBinnedTDefaultStream::new(self, spec)
}
}
pub struct IntoBinnedTDefaultStream<S, I>
where
I: AggregatableTdim,
S: Stream<Item = Result<I, Error>>,
{
inp: S,
aggtor: Option<I::Aggregator>,
spec: BinSpecDimT,
curbin: u32,
left: Option<Poll<Option<Result<I, Error>>>>,
errored: bool,
completed: bool,
inp_completed: bool,
}
impl<S, I> IntoBinnedTDefaultStream<S, I>
where
I: AggregatableTdim,
S: Stream<Item = Result<I, Error>>,
{
pub fn new(inp: S, spec: BinSpecDimT) -> Self {
//info!("spec ts {} {}", spec.ts1, spec.ts2);
Self {
inp,
aggtor: None,
spec,
curbin: 0,
left: None,
errored: false,
completed: false,
inp_completed: false,
}
}
}
impl<T, I> Stream for IntoBinnedTDefaultStream<T, I>
where
I: AggregatableTdim + Unpin,
T: Stream<Item = Result<I, Error>> + Unpin,
I::Aggregator: Unpin,
{
type Item = Result<<I::Aggregator as AggregatorTdim>::OutputValue, Error>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
use Poll::*;
if self.completed {
panic!("MergedFromRemotes ✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗✗ poll_next on completed");
}
if self.errored {
self.completed = true;
return Ready(None);
}
'outer: loop {
let cur = if self.curbin as u64 >= self.spec.count {
trace!("IntoBinnedTDefaultStream curbin out of spec, END");
Ready(None)
} else if let Some(k) = self.left.take() {
trace!("IntoBinnedTDefaultStream USE LEFTOVER");
k
} else if self.inp_completed {
Ready(None)
} else {
let inp_poll_span = span!(Level::TRACE, "into_t_inp_poll");
inp_poll_span.in_scope(|| self.inp.poll_next_unpin(cx))
};
break match cur {
Ready(Some(Ok(k))) => {
if self.aggtor.is_none() {
let range = self.spec.get_range(self.curbin);
//info!("range: {} {}", range.ts1, range.ts2);
self.aggtor = Some(k.aggregator_new(range.beg, range.end));
}
let ag = self.aggtor.as_mut().unwrap();
if ag.ends_before(&k) {
//info!("ENDS BEFORE");
continue 'outer;
} else if ag.starts_after(&k) {
//info!("STARTS AFTER");
self.left = Some(Ready(Some(Ok(k))));
self.curbin += 1;
Ready(Some(Ok(self.aggtor.take().unwrap().result())))
} else {
//info!("INGEST");
ag.ingest(&k);
// if this input contains also data after the current bin, then I need to keep
// it for the next round.
if ag.ends_after(&k) {
//info!("ENDS AFTER");
self.left = Some(Ready(Some(Ok(k))));
self.curbin += 1;
Ready(Some(Ok(self.aggtor.take().unwrap().result())))
} else {
//info!("ENDS WITHIN");
continue 'outer;
}
}
}
Ready(Some(Err(e))) => {
error!("IntoBinnedTDefaultStream err from input");
self.errored = true;
Ready(Some(Err(e)))
}
Ready(None) => {
self.inp_completed = true;
match self.aggtor.take() {
Some(ag) => Ready(Some(Ok(ag.result()))),
None => {
warn!("TODO add the trailing empty bins until requested range is complete");
self.completed = true;
Ready(None)
}
}
}
Pending => Pending,
};
}
}
}
pub fn make_test_node(id: u32) -> Node {
Node {
id,