Find start position in large files via binary search

This commit is contained in:
Dominik Werder
2021-07-30 22:14:07 +02:00
parent 002139bfae
commit 5e624bb2ca
9 changed files with 318 additions and 33 deletions

View File

@@ -1,3 +1,5 @@
use crate::generated::EPICSEvent::PayloadType;
use crate::parse::multi::parse_all_ts;
use crate::parse::PbFileReader;
use crate::{
EventsItem, MultiBinWaveEvents, PlainEvents, ScalarPlainEvents, SingleBinWaveEvents, WavePlainEvents, XBinnedEvents,
@@ -19,10 +21,12 @@ use netpod::timeunits::{DAY, SEC};
use netpod::{AggKind, ArchiverAppliance, Channel, ChannelInfo, HasScalarType, HasShape, NanoRange, ScalarType, Shape};
use serde::Serialize;
use serde_json::Value as JsonValue;
use std::io::SeekFrom;
use std::path::PathBuf;
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::fs::{read_dir, File};
use tokio::io::{AsyncReadExt, AsyncSeekExt};
pub struct DataFilename {
year: u32,
@@ -403,13 +407,31 @@ pub async fn make_single_event_pipe(
info!("•••••••••••••••••••••••••• file matches requested range");
let f1 = File::open(de.path()).await?;
info!("opened {:?}", de.path());
let z = position_file_for_evq(f1, evq.clone(), df.year).await?;
let mut f1 = if let PositionState::Positioned = z.state {
z.file
} else {
continue;
};
// TODO could avoid some seeks if position_file_for_evq would return the position instead of
// positioning the file.
let pos1 = f1.stream_position().await?;
f1.seek(SeekFrom::Start(0)).await?;
let mut pbr = PbFileReader::new(f1).await;
pbr.read_header().await?;
info!("✓ read header {:?}", pbr.payload_type());
// TODO this is ugly:
pbr.file().seek(SeekFrom::Start(pos1)).await?;
pbr.reset_io(pos1);
let mut i1 = 0;
'evread: loop {
match pbr.read_msg().await {
Ok(ei) => {
Ok(Some(ei)) => {
let ei = ei.item;
let tslast = if ei.len() > 0 { Some(ei.ts(ei.len() - 1)) } else { None };
i1 += 1;
if i1 % 1000 == 0 {
@@ -425,6 +447,10 @@ pub async fn make_single_event_pipe(
}
}
}
Ok(None) => {
info!("reached end of file");
break;
}
Err(e) => {
error!("error while reading msg {:?}", e);
break;
@@ -455,6 +481,177 @@ pub async fn make_single_event_pipe(
Ok(Box::pin(rx))
}
/// Outcome of attempting to position a pb file at the start of a requested
/// time range.
pub enum PositionState {
    /// No event at or after the requested range begin exists in the file.
    NothingFound,
    /// The file cursor was placed at (or near) the first matching event.
    Positioned,
}

/// The file handle together with the positioning outcome. The file is handed
/// back to the caller so reading can continue from the established position.
pub struct PositionResult {
    file: File,
    state: PositionState,
}
/// Position `file` at the first event with timestamp >= `evq.range.beg`.
///
/// Dispatches on file size: small files (< 512 KiB) are scanned linearly,
/// larger files are bisected. The length is taken from file metadata instead
/// of probing it with a seek-to-end/seek-to-start pair, which saves two
/// syscalls and never moves the cursor; the explicit rewind below preserves
/// the original guarantee that scanning starts at offset 0.
async fn position_file_for_evq(mut file: File, evq: RawEventsQuery, year: u32) -> Result<PositionResult, Error> {
    let flen = file.metadata().await?.len();
    // Start from the beginning regardless of where the caller left the cursor.
    file.seek(SeekFrom::Start(0)).await?;
    if flen < 1024 * 512 {
        position_file_for_evq_linear(file, evq, year).await
    } else {
        position_file_for_evq_binary(file, evq, year).await
    }
}
async fn position_file_for_evq_linear(mut file: File, evq: RawEventsQuery, year: u32) -> Result<PositionResult, Error> {
let mut pbr = PbFileReader::new(file).await;
pbr.read_header().await?;
loop {
let res = pbr.read_msg().await?;
let res = if let Some(k) = res {
k
} else {
let ret = PositionResult {
file: pbr.into_file(),
state: PositionState::NothingFound,
};
return Ok(ret);
};
if res.item.len() < 1 {
return Err(Error::with_msg_no_trace("no event read from file"));
}
if res.item.ts(res.item.len() - 1) >= evq.range.beg {
let ret = PositionResult {
file: pbr.into_file(),
state: PositionState::Positioned,
};
return Ok(ret);
}
}
}
/// Binary-search a large pb file for the first event with ts >= evq.range.beg.
///
/// Strategy: sample events near the begin and near the end of the file, check
/// the search invariant (first sampled ts < target <= last sampled ts), then
/// bisect on byte positions — snapping each probe to the first complete event
/// found after it — until the window is small enough for a linear scan.
async fn position_file_for_evq_binary(mut file: File, evq: RawEventsQuery, year: u32) -> Result<PositionResult, Error> {
    info!("position_file_for_evq_binary");
    // Probe the file length via seeks, then rewind to the start.
    let flen = file.seek(SeekFrom::End(0)).await?;
    file.seek(SeekFrom::Start(0)).await?;
    let mut pbr = PbFileReader::new(file).await;
    pbr.read_header().await?;
    let payload_type = pbr.payload_type().clone();
    // Read one message to learn where event data begins (just past the header).
    let res = pbr.read_msg().await?;
    let mut file = pbr.into_file();
    let res = if let Some(res) = res {
        res
    } else {
        return Err(Error::with_msg_no_trace("no event read from file"));
    };
    if res.item.len() < 1 {
        return Err(Error::with_msg_no_trace("no event read from file"));
    }
    let events_begin_pos = res.pos;
    // * the search invariant is that the ts1 < beg and ts2 >= end
    // * read some data from the end.
    // * read some data from the begin.
    // * extract events from begin and end.
    // * check if the binary search invariant is already violated, in that case return.
    // * otherwise, choose some spot in the middle, read there the next chunk.
    // Then use the actual position of the found item!
    let mut buf1 = vec![0; 1024 * 16];
    let mut buf2 = vec![0; 1024 * 16];
    let mut buf3 = vec![0; 1024 * 16];
    let mut p1 = events_begin_pos;
    // flen >= 512 KiB here (see position_file_for_evq), so this cannot underflow.
    let mut p2 = flen - buf2.len() as u64;
    // Read from one byte before the first event so the newline preceding it is
    // included; parse_all_ts needs that newline to delimit the first chunk.
    file.seek(SeekFrom::Start(p1 - 1)).await?;
    file.read_exact(&mut buf1).await?;
    file.seek(SeekFrom::Start(p2)).await?;
    file.read_exact(&mut buf2).await?;
    let evs1 = parse_all_ts(p1 - 1, &buf1, payload_type.clone(), year)?;
    let evs2 = parse_all_ts(p2, &buf2, payload_type.clone(), year)?;
    info!("...............................................................");
    info!("evs1: {:?}", evs1);
    info!("evs2: {:?}", evs2);
    info!("p1: {}", p1);
    info!("p2: {}", p2);
    let tgt = evq.range.beg;
    {
        // If already the very first event is at/after the target, position there.
        // NOTE(review): these first()/last() unwraps assume each 16 KiB sample
        // contains at least one complete chunk — confirm chunks are bounded
        // (see the TODO below about max chunk size).
        let ev = evs1.first().unwrap();
        if ev.ts >= tgt {
            file.seek(SeekFrom::Start(ev.pos)).await?;
            let ret = PositionResult {
                state: PositionState::Positioned,
                file,
            };
            return Ok(ret);
        }
    }
    {
        // If even the last sampled event is before the target, nothing matches.
        let ev = evs2.last().unwrap();
        if ev.ts < tgt {
            file.seek(SeekFrom::Start(0)).await?;
            let ret = PositionResult {
                state: PositionState::NothingFound,
                file,
            };
            return Ok(ret);
        }
    }
    p2 = evs2.last().unwrap().pos;
    // TODO make sure that NL-delimited chunks have a max size.
    loop {
        info!("bsearch loop p1 {} p2 {}", p1, p2);
        if p2 - p1 < 1024 * 128 {
            // TODO switch here to linear search...
            info!("switch to linear search in pos {}..{}", p1, p2);
            return linear_search_2(file, evq, year, p1, p2, payload_type).await;
        }
        // Probe the midpoint; snap to the first complete event found after it,
        // so p1/p2 always land on real event boundaries.
        let p3 = (p2 + p1) / 2;
        file.seek(SeekFrom::Start(p3)).await?;
        file.read_exact(&mut buf3).await?;
        let evs3 = parse_all_ts(p3, &buf3, payload_type.clone(), year)?;
        let ev = evs3.first().unwrap();
        if ev.ts < tgt {
            info!("p3 {} ts: {} pos: {} branch A", p3, ev.ts, ev.pos);
            p1 = ev.pos;
        } else {
            info!("p3 {} ts: {} pos: {} branch B", p3, ev.ts, ev.pos);
            p2 = ev.pos;
        }
    }
}
/// Linearly scan the byte range `p1..p2` of `file` for the first event with
/// ts >= `evq.range.beg`, seek the file to that event's position and return
/// it as `Positioned`. Errors when no event in the range matches.
///
/// The read starts at `p1 - 1` so the newline terminating the previous chunk
/// is included, which `parse_all_ts` requires to delimit the first chunk.
async fn linear_search_2(
    mut file: File,
    evq: RawEventsQuery,
    year: u32,
    p1: u64,
    p2: u64,
    payload_type: PayloadType,
) -> Result<PositionResult, Error> {
    // Log through the module's logger like the other search paths,
    // instead of writing directly to stderr.
    info!("linear_search_2");
    file.seek(SeekFrom::Start(p1 - 1)).await?;
    let mut buf = vec![0; (p2 - p1) as usize];
    file.read_exact(&mut buf).await?;
    // Last use of payload_type, so it can be moved rather than cloned.
    let evs1 = parse_all_ts(p1 - 1, &buf, payload_type, year)?;
    for ev in evs1 {
        if ev.ts >= evq.range.beg {
            info!("FOUND {:?}", ev);
            // Leave the file positioned exactly at the matching event.
            file.seek(SeekFrom::Start(ev.pos)).await?;
            let ret = PositionResult {
                file,
                state: PositionState::Positioned,
            };
            return Ok(ret);
        }
    }
    Err(Error::with_msg_no_trace("linear_search_2 failed"))
}
#[allow(unused)]
fn events_item_to_framable(ei: EventsItem) -> Result<Box<dyn Framable + Send>, Error> {
match ei {
@@ -527,15 +724,19 @@ pub async fn channel_info(channel: &Channel, aa: &ArchiverAppliance) -> Result<C
msgs.push(format!("got header {}", pbr.channel_name()));
let ev = pbr.read_msg().await;
match ev {
Ok(item) => {
Ok(Some(item)) => {
let item = item.item;
msgs.push(format!("got event {:?}", item));
shape = Some(item.shape());
// These type mappings are defined by the protobuffer schema.
scalar_type = Some(item.scalar_type());
break;
}
Ok(None) => {
msgs.push(format!("can not read event"));
}
Err(e) => {
msgs.push(format!("can not read event! {:?}", e));
msgs.push(format!("can not read event {:?}", e));
}
}
msgs.push(format!("got header {}", pbr.channel_name()));

View File

@@ -1,3 +1,5 @@
pub mod multi;
use crate::events::parse_data_filename;
use crate::generated::EPICSEvent::PayloadType;
use crate::{unescape_archapp_msg, EventsItem, PlainEvents, ScalarPlainEvents, WavePlainEvents};
@@ -27,6 +29,7 @@ pub struct PbFileReader {
escbuf: Vec<u8>,
wp: usize,
rp: usize,
off: u64,
channel_name: String,
payload_type: PayloadType,
year: u32,
@@ -85,6 +88,11 @@ macro_rules! wave_parse {
const MIN_BUF_FILL: usize = 1024 * 64;
/// A parsed message together with the absolute byte offset in the pb file at
/// which the message started.
pub struct ReadMessageResult {
    // Absolute file offset of the start of this message.
    pub pos: u64,
    // The events decoded from the message payload.
    pub item: EventsItem,
}
impl PbFileReader {
pub async fn new(file: File) -> Self {
Self {
@@ -93,12 +101,27 @@ impl PbFileReader {
escbuf: vec![],
wp: 0,
rp: 0,
off: 0,
channel_name: String::new(),
payload_type: PayloadType::V4_GENERIC_BYTES,
year: 0,
}
}
/// Consume the reader and give back the underlying file handle.
/// Any data already buffered but not yet consumed is discarded.
pub fn into_file(self) -> File {
    self.file
}
/// Mutable access to the underlying file, e.g. for seeking it externally.
/// NOTE(review): seeking through this handle without a matching `reset_io`
/// call desynchronizes the reader's internal buffer and offset bookkeeping —
/// confirm callers always pair the two.
pub fn file(&mut self) -> &mut File {
    &mut self.file
}
/// Discard all buffered data and declare the file's current position to be
/// the absolute offset `off`. Intended for use after the underlying file has
/// been repositioned externally via `file()`.
pub fn reset_io(&mut self, off: u64) {
    self.wp = 0;
    self.rp = 0;
    self.off = off;
}
pub async fn read_header(&mut self) -> Result<(), Error> {
self.fill_buf().await?;
let k = self.find_next_nl()?;
@@ -110,62 +133,73 @@ impl PbFileReader {
self.channel_name = payload_info.get_pvname().into();
self.payload_type = payload_info.get_field_type();
self.year = payload_info.get_year() as u32;
self.off += k as u64 + 1 - self.rp as u64;
self.rp = k + 1;
Ok(())
}
pub async fn read_msg(&mut self) -> Result<EventsItem, Error> {
/// Read, unescape and parse the next newline-delimited message.
///
/// Returns `Ok(None)` at end of input (no further newline in the buffered
/// data), otherwise the parsed `EventsItem` together with the absolute byte
/// offset at which the message started.
///
/// NOTE(review): a `find_next_nl` error is treated as end-of-file here, so a
/// genuine framing error is indistinguishable from EOF — confirm intended.
pub async fn read_msg(&mut self) -> Result<Option<ReadMessageResult>, Error> {
    self.fill_buf().await?;
    let k = match self.find_next_nl() {
        Ok(k) => k,
        Err(_) => return Ok(None),
    };
    // Unescape into the reusable escape buffer, recycling its allocation.
    let m = mem::take(&mut self.escbuf);
    let m = unescape_archapp_msg(&self.buf[self.rp..k], m)?;
    self.escbuf = m;
    let ei = Self::parse_buffer(&self.escbuf, self.payload_type.clone(), self.year)?;
    let ret = ReadMessageResult {
        // Absolute file offset where this message began.
        pos: self.off,
        item: ei,
    };
    // Advance the absolute offset and the read pointer past the newline.
    self.off += k as u64 + 1 - self.rp as u64;
    self.rp = k + 1;
    Ok(Some(ret))
}
pub fn parse_buffer(m: &[u8], payload_type: PayloadType, year: u32) -> Result<EventsItem, Error> {
use PayloadType::*;
let ei = match self.payload_type {
SCALAR_BYTE => parse_scalar_byte(m, self.year)?,
let ei = match payload_type {
SCALAR_BYTE => parse_scalar_byte(m, year)?,
SCALAR_ENUM => {
scalar_parse!(m, self.year, ScalarEnum, Int, i32)
scalar_parse!(m, year, ScalarEnum, Int, i32)
}
SCALAR_SHORT => {
scalar_parse!(m, self.year, ScalarShort, Short, i16)
scalar_parse!(m, year, ScalarShort, Short, i16)
}
SCALAR_INT => {
scalar_parse!(m, self.year, ScalarInt, Int, i32)
scalar_parse!(m, year, ScalarInt, Int, i32)
}
SCALAR_FLOAT => {
scalar_parse!(m, self.year, ScalarFloat, Float, f32)
scalar_parse!(m, year, ScalarFloat, Float, f32)
}
SCALAR_DOUBLE => {
scalar_parse!(m, self.year, ScalarDouble, Double, f64)
scalar_parse!(m, year, ScalarDouble, Double, f64)
}
WAVEFORM_BYTE => {
wave_parse!(m, self.year, VectorChar, Byte, i8)
wave_parse!(m, year, VectorChar, Byte, i8)
}
WAVEFORM_SHORT => {
wave_parse!(m, self.year, VectorShort, Short, i16)
wave_parse!(m, year, VectorShort, Short, i16)
}
WAVEFORM_ENUM => {
wave_parse!(m, self.year, VectorEnum, Int, i32)
wave_parse!(m, year, VectorEnum, Int, i32)
}
WAVEFORM_INT => {
wave_parse!(m, self.year, VectorInt, Int, i32)
wave_parse!(m, year, VectorInt, Int, i32)
}
WAVEFORM_FLOAT => {
wave_parse!(m, self.year, VectorFloat, Float, f32)
wave_parse!(m, year, VectorFloat, Float, f32)
}
WAVEFORM_DOUBLE => {
wave_parse!(m, self.year, VectorDouble, Double, f64)
wave_parse!(m, year, VectorDouble, Double, f64)
}
SCALAR_STRING | WAVEFORM_STRING | V4_GENERIC_BYTES => {
return Err(Error::with_msg_no_trace(format!(
"not supported: {:?}",
self.payload_type
)));
return Err(Error::with_msg_no_trace(format!("not supported: {:?}", payload_type)));
}
};
self.rp = k + 1;
Ok(ei)
}
@@ -176,9 +210,6 @@ impl PbFileReader {
if self.rp + MIN_BUF_FILL >= self.buf.len() {
let n = self.wp - self.rp;
self.buf.copy_within(self.rp..self.rp + n, 0);
//for i in 0..n {
// self.buf[i] = self.buf[self.rp + i];
//}
self.rp = 0;
self.wp = n;
}
@@ -205,6 +236,7 @@ impl PbFileReader {
k += 1;
}
if k == self.wp {
// TODO test whether with_msg_no_trace makes difference.
return Err(Error::with_msg("no nl in pb file"));
}
Ok(k)
@@ -442,7 +474,8 @@ pub async fn scan_files_inner(
if false {
dbconn::insert_channel(channel_path.into(), ndi.facility, &dbc).await?;
}
if let Ok(msg) = pbr.read_msg().await {
if let Ok(Some(msg)) = pbr.read_msg().await {
let msg = msg.item;
lru.insert(channel_path);
{
tx.send(Ok(Box::new(serde_json::to_value(format!(
@@ -451,10 +484,6 @@ pub async fn scan_files_inner(
msg.variant_name()
))?) as ItemSerBox))
.await?;
/*waves_found += 1;
if waves_found >= 20 {
break;
}*/
}
}
}

View File

@@ -0,0 +1,50 @@
use crate::generated::EPICSEvent::PayloadType;
use crate::parse::PbFileReader;
use err::Error;
use items::{WithLen, WithTimestamps};
/// Byte position and timestamp of one parsed event within a pb file.
#[derive(Debug)]
pub struct PosTs {
    // Absolute byte offset of the event's chunk start within the file.
    pub pos: u64,
    // Timestamp of the event.
    pub ts: u64,
}
pub fn parse_all_ts(off: u64, buf: &[u8], payload_type: PayloadType, year: u32) -> Result<Vec<PosTs>, Error> {
let mut ret = vec![];
let mut i1 = 0;
let mut i2 = usize::MAX;
loop {
if i1 >= buf.len() {
break;
}
if buf[i1] == 10 {
if i2 == usize::MAX {
i2 = i1;
} else {
// Have a chunk from i2..i1
match PbFileReader::parse_buffer(&buf[i2 + 1..i1], payload_type.clone(), year) {
Ok(k) => {
if k.len() != 1 {
return Err(Error::with_msg_no_trace(format!(
"parsed buffer contained {} events",
k.len()
)));
} else {
let h = PosTs {
pos: off + i2 as u64 + 1,
ts: k.ts(0),
};
ret.push(h);
}
}
Err(e) => {
// TODO ignore except if it's the last chunk.
}
}
i2 = i1;
}
}
i1 += 1;
}
Ok(ret)
}