RawSubFile support multi file access (#173)

This PR is a fix/improvement to a problem that Jonathan had. (#156) The
original implementation opened all subfiles at once witch works for
normal sized datasets but fails at a certain point (thousands of files).

- This solution uses RawSubFile to manage the different file indicies
and only opens the file we need
- Added logger.h from slsDetectorPackage for debug printing (in
production no messages should be visible)
This commit is contained in:
Erik Fröjdh
2025-05-22 11:00:03 +02:00
committed by GitHub
parent a6eebbe9bd
commit 9e1b8731b0
16 changed files with 517 additions and 216 deletions

View File

@@ -1,6 +1,8 @@
#include "aare/RawFile.hpp"
#include "aare/algorithm.hpp"
#include "aare/PixelMap.hpp"
#include "aare/defs.hpp"
#include "aare/logger.hpp"
#include "aare/geo_helpers.hpp"
#include <fmt/format.h>
@@ -14,23 +16,14 @@ RawFile::RawFile(const std::filesystem::path &fname, const std::string &mode)
: m_master(fname) {
m_mode = mode;
if (mode == "r") {
n_subfiles = find_number_of_subfiles(); // f0,f1...fn
n_subfile_parts =
m_master.geometry().col * m_master.geometry().row; // d0,d1...dn
find_geometry();
if (m_master.roi()){
m_geometry = update_geometry_with_roi(m_geometry, m_master.roi().value());
}
open_subfiles();
} else {
throw std::runtime_error(LOCATION +
"Unsupported mode. Can only read RawFiles.");
" Unsupported mode. Can only read RawFiles.");
}
}
@@ -67,12 +60,12 @@ void RawFile::read_into(std::byte *image_buf, size_t n_frames, DetectorHeader *h
this->get_frame_into(m_current_frame++, image_buf, header);
image_buf += bytes_per_frame();
if(header)
header+=n_mod();
header+=n_modules();
}
}
size_t RawFile::n_mod() const { return n_subfile_parts; }
size_t RawFile::n_modules() const { return m_master.n_modules(); }
size_t RawFile::bytes_per_frame() {
@@ -106,17 +99,11 @@ xy RawFile::geometry() { return m_master.geometry(); }
void RawFile::open_subfiles() {
if (m_mode == "r")
for (size_t i = 0; i != n_subfiles; ++i) {
auto v = std::vector<RawSubFile *>(n_subfile_parts);
for (size_t j = 0; j != n_subfile_parts; ++j) {
auto pos = m_geometry.module_pixel_0[j];
v[j] = new RawSubFile(m_master.data_fname(j, i),
m_master.detector_type(), pos.height,
pos.width, m_master.bitdepth(),
pos.row_index, pos.col_index);
}
subfiles.push_back(v);
for (size_t i = 0; i != n_modules(); ++i) {
auto pos = m_geometry.module_pixel_0[i];
m_subfiles.emplace_back(std::make_unique<RawSubFile>(
m_master.data_fname(i, 0), m_master.detector_type(), pos.height,
pos.width, m_master.bitdepth(), pos.row_index, pos.col_index));
}
else {
throw std::runtime_error(LOCATION +
@@ -141,18 +128,6 @@ DetectorHeader RawFile::read_header(const std::filesystem::path &fname) {
return h;
}
int RawFile::find_number_of_subfiles() {
int n_files = 0;
// f0,f1...fn How many files is the data split into?
while (std::filesystem::exists(m_master.data_fname(0, n_files)))
n_files++; // increment after test
#ifdef AARE_VERBOSE
fmt::print("Found: {} subfiles\n", n_files);
#endif
return n_files;
}
RawMasterFile RawFile::master() const { return m_master; }
@@ -168,7 +143,7 @@ void RawFile::find_geometry() {
uint16_t c{};
for (size_t i = 0; i < n_subfile_parts; i++) {
for (size_t i = 0; i < n_modules(); i++) {
auto h = read_header(m_master.data_fname(i, 0));
r = std::max(r, h.row);
c = std::max(c, h.column);
@@ -210,70 +185,58 @@ size_t RawFile::bytes_per_pixel() const {
}
void RawFile::get_frame_into(size_t frame_index, std::byte *frame_buffer, DetectorHeader *header) {
LOG(logDEBUG) << "RawFile::get_frame_into(" << frame_index << ")";
if (frame_index >= total_frames()) {
throw std::runtime_error(LOCATION + "Frame number out of range");
}
std::vector<size_t> frame_numbers(n_subfile_parts);
std::vector<size_t> frame_indices(n_subfile_parts, frame_index);
std::vector<size_t> frame_numbers(n_modules());
std::vector<size_t> frame_indices(n_modules(), frame_index);
// sync the frame numbers
if (n_subfile_parts != 1) {
for (size_t part_idx = 0; part_idx != n_subfile_parts; ++part_idx) {
auto subfile_id = frame_index / m_master.max_frames_per_file();
if (subfile_id >= subfiles.size()) {
throw std::runtime_error(LOCATION +
" Subfile out of range. Possible missing data.");
}
frame_numbers[part_idx] =
subfiles[subfile_id][part_idx]->frame_number(
frame_index % m_master.max_frames_per_file());
if (n_modules() != 1) { //if we have more than one module
for (size_t part_idx = 0; part_idx != n_modules(); ++part_idx) {
frame_numbers[part_idx] = m_subfiles[part_idx]->frame_number(frame_index);
}
// 1. if frame number vector is the same break
while (std::adjacent_find(frame_numbers.begin(), frame_numbers.end(),
std::not_equal_to<>()) !=
frame_numbers.end()) {
while (!all_equal(frame_numbers)) {
// 2. find the index of the minimum frame number,
auto min_frame_idx = std::distance(
frame_numbers.begin(),
std::min_element(frame_numbers.begin(), frame_numbers.end()));
// 3. increase its index and update its respective frame number
frame_indices[min_frame_idx]++;
// 4. if we can't increase its index => throw error
if (frame_indices[min_frame_idx] >= total_frames()) {
throw std::runtime_error(LOCATION +
"Frame number out of range");
}
auto subfile_id =
frame_indices[min_frame_idx] / m_master.max_frames_per_file();
frame_numbers[min_frame_idx] =
subfiles[subfile_id][min_frame_idx]->frame_number(
frame_indices[min_frame_idx] %
m_master.max_frames_per_file());
m_subfiles[min_frame_idx]->frame_number(frame_indices[min_frame_idx]);
}
}
if (m_master.geometry().col == 1) {
// get the part from each subfile and copy it to the frame
for (size_t part_idx = 0; part_idx != n_subfile_parts; ++part_idx) {
for (size_t part_idx = 0; part_idx != n_modules(); ++part_idx) {
auto corrected_idx = frame_indices[part_idx];
auto subfile_id = corrected_idx / m_master.max_frames_per_file();
if (subfile_id >= subfiles.size()) {
throw std::runtime_error(LOCATION +
" Subfile out of range. Possible missing data.");
}
// This is where we start writing
auto offset = (m_geometry.module_pixel_0[part_idx].origin_y * m_geometry.pixels_x +
m_geometry.module_pixel_0[part_idx].origin_x)*m_master.bitdepth()/8;
if (m_geometry.module_pixel_0[part_idx].origin_x!=0)
throw std::runtime_error(LOCATION + "Implementation error. x pos not 0.");
throw std::runtime_error(LOCATION + " Implementation error. x pos not 0.");
//TODO! Risk for out of range access
subfiles[subfile_id][part_idx]->seek(corrected_idx % m_master.max_frames_per_file());
subfiles[subfile_id][part_idx]->read_into(frame_buffer + offset, header);
//TODO! What if the files don't match?
m_subfiles[part_idx]->seek(corrected_idx);
m_subfiles[part_idx]->read_into(frame_buffer + offset, header);
if (header)
++header;
}
@@ -282,26 +245,21 @@ void RawFile::get_frame_into(size_t frame_index, std::byte *frame_buffer, Detect
//TODO! should we read row by row?
// create a buffer large enough to hold a full module
auto bytes_per_part = m_master.pixels_y() * m_master.pixels_x() *
m_master.bitdepth() /
8; // TODO! replace with image_size_in_bytes
auto *part_buffer = new std::byte[bytes_per_part];
// TODO! if we have many submodules we should reorder them on the module
// level
for (size_t part_idx = 0; part_idx != n_subfile_parts; ++part_idx) {
for (size_t part_idx = 0; part_idx != n_modules(); ++part_idx) {
auto pos = m_geometry.module_pixel_0[part_idx];
auto corrected_idx = frame_indices[part_idx];
auto subfile_id = corrected_idx / m_master.max_frames_per_file();
if (subfile_id >= subfiles.size()) {
throw std::runtime_error(LOCATION +
" Subfile out of range. Possible missing data.");
}
subfiles[subfile_id][part_idx]->seek(corrected_idx % m_master.max_frames_per_file());
subfiles[subfile_id][part_idx]->read_into(part_buffer, header);
m_subfiles[part_idx]->seek(corrected_idx);
m_subfiles[part_idx]->read_into(part_buffer, header);
if(header)
++header;
@@ -321,6 +279,7 @@ void RawFile::get_frame_into(size_t frame_index, std::byte *frame_buffer, Detect
}
delete[] part_buffer;
}
}
std::vector<Frame> RawFile::read_n(size_t n_frames) {
@@ -337,27 +296,8 @@ size_t RawFile::frame_number(size_t frame_index) {
if (frame_index >= m_master.frames_in_file()) {
throw std::runtime_error(LOCATION + " Frame number out of range");
}
size_t subfile_id = frame_index / m_master.max_frames_per_file();
if (subfile_id >= subfiles.size()) {
throw std::runtime_error(
LOCATION + " Subfile out of range. Possible missing data.");
}
return subfiles[subfile_id][0]->frame_number(
frame_index % m_master.max_frames_per_file());
return m_subfiles[0]->frame_number(frame_index);
}
RawFile::~RawFile() {
// TODO! Fix this, for file closing
for (auto &vec : subfiles) {
for (auto *subfile : vec) {
delete subfile;
}
}
}
} // namespace aare

View File

@@ -99,11 +99,11 @@ TEST_CASE("Read frame numbers from a raw file", "[.integration]") {
}
}
TEST_CASE("Compare reading from a numpy file with a raw file", "[.integration]") {
auto fpath_raw = test_data_path() / "jungfrau" / "jungfrau_single_master_0.json";
TEST_CASE("Compare reading from a numpy file with a raw file", "[.files]") {
auto fpath_raw = test_data_path() / "raw/jungfrau" / "jungfrau_single_master_0.json";
REQUIRE(std::filesystem::exists(fpath_raw));
auto fpath_npy = test_data_path() / "jungfrau" / "jungfrau_single_0.npy";
auto fpath_npy = test_data_path() / "raw/jungfrau" / "jungfrau_single_0.npy";
REQUIRE(std::filesystem::exists(fpath_npy));
File raw(fpath_raw, "r");
@@ -113,6 +113,7 @@ TEST_CASE("Compare reading from a numpy file with a raw file", "[.integration]")
CHECK(npy.total_frames() == 10);
for (size_t i = 0; i < 10; ++i) {
CHECK(raw.tell() == i);
auto raw_frame = raw.read_frame();
auto npy_frame = npy.read_frame();
CHECK((raw_frame.view<uint16_t>() == npy_frame.view<uint16_t>()));

View File

@@ -140,6 +140,10 @@ std::optional<size_t> RawMasterFile::number_of_rows() const {
xy RawMasterFile::geometry() const { return m_geometry; }
size_t RawMasterFile::n_modules() const {
return m_geometry.row * m_geometry.col;
}
std::optional<uint8_t> RawMasterFile::quad() const { return m_quad; }
// optional values, these may or may not be present in the master file

View File

@@ -1,10 +1,14 @@
#include "aare/RawSubFile.hpp"
#include "aare/PixelMap.hpp"
#include "aare/algorithm.hpp"
#include "aare/utils/ifstream_helpers.hpp"
#include "aare/logger.hpp"
#include <cstring> // memcpy
#include <fmt/core.h>
#include <iostream>
#include <regex>
namespace aare {
@@ -12,51 +16,51 @@ namespace aare {
RawSubFile::RawSubFile(const std::filesystem::path &fname,
DetectorType detector, size_t rows, size_t cols,
size_t bitdepth, uint32_t pos_row, uint32_t pos_col)
: m_detector_type(detector), m_bitdepth(bitdepth), m_fname(fname),
: m_detector_type(detector), m_bitdepth(bitdepth),
m_rows(rows), m_cols(cols),
m_bytes_per_frame((m_bitdepth / 8) * m_rows * m_cols), m_pos_row(pos_row),
m_pos_col(pos_col) {
LOG(logDEBUG) << "RawSubFile::RawSubFile()";
if (m_detector_type == DetectorType::Moench03_old) {
m_pixel_map = GenerateMoench03PixelMap();
} else if (m_detector_type == DetectorType::Eiger && m_pos_row % 2 == 0) {
m_pixel_map = GenerateEigerFlipRowsPixelMap();
}
if (std::filesystem::exists(fname)) {
m_num_frames = std::filesystem::file_size(fname) /
(sizeof(DetectorHeader) + rows * cols * bitdepth / 8);
} else {
throw std::runtime_error(
LOCATION + fmt::format("File {} does not exist", m_fname.string()));
}
// fp = fopen(m_fname.string().c_str(), "rb");
m_file.open(m_fname, std::ios::binary);
if (!m_file.is_open()) {
throw std::runtime_error(
LOCATION + fmt::format("Could not open file {}", m_fname.string()));
}
#ifdef AARE_VERBOSE
fmt::print("Opened file: {} with {} frames\n", m_fname.string(), m_num_frames);
fmt::print("m_rows: {}, m_cols: {}, m_bitdepth: {}\n", m_rows, m_cols,
m_bitdepth);
fmt::print("file size: {}\n", std::filesystem::file_size(fname));
#endif
parse_fname(fname);
scan_files();
open_file(m_current_file_index); // open the first file
}
void RawSubFile::seek(size_t frame_index) {
if (frame_index >= m_num_frames) {
throw std::runtime_error(LOCATION + fmt::format("Frame index {} out of range in a file with {} frames", frame_index, m_num_frames));
LOG(logDEBUG) << "RawSubFile::seek(" << frame_index << ")";
if (frame_index >= m_total_frames) {
throw std::runtime_error(LOCATION + " Frame index out of range: " +
std::to_string(frame_index));
}
m_file.seekg((sizeof(DetectorHeader) + bytes_per_frame()) * frame_index);
m_current_frame_index = frame_index;
auto file_index = first_larger(m_last_frame_in_file, frame_index);
if (file_index != m_current_file_index)
open_file(file_index);
auto frame_offset = (file_index)
? frame_index - m_last_frame_in_file[file_index - 1]
: frame_index;
auto byte_offset = frame_offset * (m_bytes_per_frame + sizeof(DetectorHeader));
m_file.seekg(byte_offset);
}
size_t RawSubFile::tell() {
return m_file.tellg() / (sizeof(DetectorHeader) + bytes_per_frame());
LOG(logDEBUG) << "RawSubFile::tell():" << m_current_frame_index;
return m_current_frame_index;
}
void RawSubFile::read_into(std::byte *image_buf, DetectorHeader *header) {
LOG(logDEBUG) << "RawSubFile::read_into()";
if (header) {
m_file.read(reinterpret_cast<char *>(header), sizeof(DetectorHeader));
} else {
@@ -90,6 +94,13 @@ void RawSubFile::read_into(std::byte *image_buf, DetectorHeader *header) {
if (m_file.fail()){
throw std::runtime_error(LOCATION + ifstream_error_msg(m_file));
}
++ m_current_frame_index;
if (m_current_frame_index >= m_last_frame_in_file[m_current_file_index] &&
(m_current_frame_index < m_total_frames)) {
++m_current_file_index;
open_file(m_current_file_index);
}
}
void RawSubFile::read_into(std::byte *image_buf, size_t n_frames, DetectorHeader *header) {
@@ -130,4 +141,69 @@ size_t RawSubFile::frame_number(size_t frame_index) {
return h.frameNumber;
}
void RawSubFile::parse_fname(const std::filesystem::path &fname) {
LOG(logDEBUG) << "RawSubFile::parse_fname()";
// data has the format: /path/too/data/jungfrau_single_d0_f1_0.raw
// d0 is the module index, will not change for this file
// f1 is the file index - thi is the one we need
// 0 is the measurement index, will not change
m_path = fname.parent_path();
m_base_name = fname.filename();
// Regex to extract numbers after 'd' and 'f'
std::regex pattern(R"(^(.*_d)(\d+)(_f)(\d+)(_\d+\.raw)$)");
std::smatch match;
if (std::regex_match(m_base_name, match, pattern)) {
m_offset = std::stoi(match[4].str()); // find the first file index in case of a truncated series
m_base_name = match[1].str() + match[2].str() + match[3].str() + "{}" + match[5].str();
LOG(logDEBUG) << "Base name: " << m_base_name;
LOG(logDEBUG) << "Offset: " << m_offset;
LOG(logDEBUG) << "Path: " << m_path.string();
} else {
throw std::runtime_error(
LOCATION + fmt::format("Could not parse file name {}", fname.string()));
}
}
std::filesystem::path RawSubFile::fpath(size_t file_index) const {
auto fname = fmt::format(m_base_name, file_index);
return m_path / fname;
}
void RawSubFile::open_file(size_t file_index) {
m_file.close();
auto fname = fpath(file_index+m_offset);
LOG(logDEBUG) << "RawSubFile::open_file(): " << fname.string();
m_file.open(fname, std::ios::binary);
if (!m_file.is_open()) {
throw std::runtime_error(
LOCATION + fmt::format("Could not open file {}", fpath(file_index).string()));
}
m_current_file_index = file_index;
}
void RawSubFile::scan_files() {
LOG(logDEBUG) << "RawSubFile::scan_files()";
// find how many files we have and the number of frames in each file
m_last_frame_in_file.clear();
size_t file_index = m_offset;
while (std::filesystem::exists(fpath(file_index))) {
auto n_frames = std::filesystem::file_size(fpath(file_index)) /
(m_bytes_per_frame + sizeof(DetectorHeader));
m_last_frame_in_file.push_back(n_frames);
LOG(logDEBUG) << "Found: " << n_frames << " frames in file: " << fpath(file_index).string();
++file_index;
}
// find where we need to open the next file and total number of frames
m_last_frame_in_file = cumsum(m_last_frame_in_file);
if(m_last_frame_in_file.empty()){
m_total_frames = 0;
}else{
m_total_frames = m_last_frame_in_file.back();
}
}
} // namespace aare

76
src/RawSubFile.test.cpp Normal file
View File

@@ -0,0 +1,76 @@
#include "aare/RawSubFile.hpp"
#include "aare/File.hpp"
#include "aare/NDArray.hpp"
#include <catch2/catch_test_macros.hpp>
#include "test_config.hpp"
using namespace aare;
TEST_CASE("Read frames directly from a RawSubFile", "[.files]"){
auto fpath_raw = test_data_path() / "raw/jungfrau" / "jungfrau_single_d0_f0_0.raw";
REQUIRE(std::filesystem::exists(fpath_raw));
RawSubFile f(fpath_raw, DetectorType::Jungfrau, 512, 1024, 16);
REQUIRE(f.rows() == 512);
REQUIRE(f.cols() == 1024);
REQUIRE(f.pixels_per_frame() == 512 * 1024);
REQUIRE(f.bytes_per_frame() == 512 * 1024 * 2);
REQUIRE(f.bytes_per_pixel() == 2);
auto fpath_npy = test_data_path() / "raw/jungfrau" / "jungfrau_single_0.npy";
REQUIRE(std::filesystem::exists(fpath_npy));
//Numpy file with the same data to use as reference
File npy(fpath_npy, "r");
CHECK(f.frames_in_file() == 10);
CHECK(npy.total_frames() == 10);
DetectorHeader header{};
NDArray<uint16_t, 2> image({static_cast<ssize_t>(f.rows()), static_cast<ssize_t>(f.cols())});
for (size_t i = 0; i < 10; ++i) {
CHECK(f.tell() == i);
f.read_into(image.buffer(), &header);
auto npy_frame = npy.read_frame();
CHECK((image.view() == npy_frame.view<uint16_t>()));
}
}
TEST_CASE("Read frames directly from a RawSubFile starting at the second file", "[.files]"){
// we know this file has 10 frames with frame numbers 1 to 10
// f0 1,2,3
// f1 4,5,6 <-- starting here
// f2 7,8,9
// f3 10
auto fpath_raw = test_data_path() / "raw/jungfrau" / "jungfrau_single_d0_f1_0.raw";
REQUIRE(std::filesystem::exists(fpath_raw));
RawSubFile f(fpath_raw, DetectorType::Jungfrau, 512, 1024, 16);
auto fpath_npy = test_data_path() / "raw/jungfrau" / "jungfrau_single_0.npy";
REQUIRE(std::filesystem::exists(fpath_npy));
//Numpy file with the same data to use as reference
File npy(fpath_npy, "r");
npy.seek(3);
CHECK(f.frames_in_file() == 7);
CHECK(npy.total_frames() == 10);
DetectorHeader header{};
NDArray<uint16_t, 2> image({static_cast<ssize_t>(f.rows()), static_cast<ssize_t>(f.cols())});
for (size_t i = 0; i < 7; ++i) {
CHECK(f.tell() == i);
f.read_into(image.buffer(), &header);
// frame numbers start at 1 frame index at 0
// adding 3 + 1 to verify the frame number
CHECK(header.frameNumber == i + 4);
auto npy_frame = npy.read_frame();
CHECK((image.view() == npy_frame.view<uint16_t>()));
}
}

View File

@@ -160,3 +160,36 @@ TEST_CASE("cumsum works with negative numbers", "[algorithm]") {
REQUIRE(result[3] == -6);
REQUIRE(result[4] == -10);
}
TEST_CASE("cumsum on an empty vector", "[algorithm]") {
std::vector<double> vec = {};
auto result = aare::cumsum(vec);
REQUIRE(result.size() == 0);
}
TEST_CASE("All equal on an empty vector is false", "[algorithm]") {
std::vector<int> vec = {};
REQUIRE(aare::all_equal(vec) == false);
}
TEST_CASE("All equal on a vector with 1 element is true", "[algorithm]") {
std::vector<int> vec = {1};
REQUIRE(aare::all_equal(vec) == true);
}
TEST_CASE("All equal on a vector with 2 elements is true", "[algorithm]") {
std::vector<int> vec = {1, 1};
REQUIRE(aare::all_equal(vec) == true);
}
TEST_CASE("All equal on a vector with two different elements is false", "[algorithm]") {
std::vector<int> vec = {1, 2};
REQUIRE(aare::all_equal(vec) == false);
}
TEST_CASE("Last element is different", "[algorithm]") {
std::vector<int> vec = {1, 1, 1, 1, 2};
REQUIRE(aare::all_equal(vec) == false);
}