mirror of
https://github.com/slsdetectorgroup/aare.git
synced 2026-06-04 14:58:46 +02:00
row bug and memcpy
This commit is contained in:
@@ -31,7 +31,12 @@ class PixelHistogram {
|
||||
const AxisType xmin_;
|
||||
const AxisType xmax_;
|
||||
std::vector<Hist> partial_hists_;
|
||||
|
||||
// Cumulative row offsets so that thread t owns rows
|
||||
// [row_offsets_[t], row_offsets_[t + 1])
|
||||
// Length is n_threads_ + 1; partition is balanced (the first
|
||||
// rows_ % n_threads_ threads get one extra row each).
|
||||
std::vector<int> row_offsets_;
|
||||
|
||||
// Thread pool members
|
||||
std::vector<std::thread> workers_;
|
||||
std::mutex work_mutex_;
|
||||
@@ -41,7 +46,7 @@ class PixelHistogram {
|
||||
std::atomic<int> completed_threads_;
|
||||
std::atomic<bool> stop_workers_;
|
||||
int work_generation_;
|
||||
|
||||
|
||||
// Private worker thread method
|
||||
void worker_loop(int thread_id);
|
||||
int row_start(int thread_id) const;
|
||||
|
||||
+31
-20
@@ -23,7 +23,23 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub
|
||||
|
||||
n_threads_ = std::min(n_threads_, rows_);
|
||||
|
||||
// Build a balanced row partition. With base = rows_ / n_threads_ and
|
||||
// extra = rows_ % n_threads_, the first `extra` threads get base + 1
|
||||
// rows each and the rest get `base` rows. This avoids the
|
||||
// ceil(rows/n_threads) scheme leaving trailing threads with zero or
|
||||
// negative row counts (e.g. rows=17, n_threads=8).
|
||||
row_offsets_.resize(n_threads_ + 1);
|
||||
const int base = rows_ / n_threads_;
|
||||
const int extra = rows_ % n_threads_;
|
||||
int offset = 0;
|
||||
for (int i = 0; i < n_threads_; ++i) {
|
||||
row_offsets_[i] = offset;
|
||||
offset += base + (i < extra ? 1 : 0);
|
||||
}
|
||||
row_offsets_[n_threads_] = offset; // == rows_ by construction
|
||||
|
||||
// Initialize partial histograms for each thread
|
||||
partial_hists_.reserve(n_threads_);
|
||||
for (int i = 0; i < n_threads_; ++i) {
|
||||
const auto local_rows = row_count(i);
|
||||
partial_hists_.emplace_back(
|
||||
@@ -33,7 +49,7 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub
|
||||
bh::axis::integer<int, bh::use_default, bh::axis::option::none_t>(0, local_rows, "x")
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Spawn worker threads
|
||||
for (int i = 0; i < n_threads_; ++i) {
|
||||
workers_.emplace_back([this, i]() { this->worker_loop(i); });
|
||||
@@ -57,14 +73,11 @@ PixelHistogram::~PixelHistogram() {
|
||||
}
|
||||
|
||||
int PixelHistogram::row_start(int thread_id) const {
|
||||
const int rows_per_thread = (rows_ + n_threads_ - 1) / n_threads_;
|
||||
return thread_id * rows_per_thread;
|
||||
return row_offsets_[thread_id];
|
||||
}
|
||||
|
||||
int PixelHistogram::row_count(int thread_id) const {
|
||||
const int start = row_start(thread_id);
|
||||
const int end = std::min(start + (rows_ + n_threads_ - 1) / n_threads_, rows_);
|
||||
return end - start;
|
||||
return row_offsets_[thread_id + 1] - row_offsets_[thread_id];
|
||||
}
|
||||
|
||||
void PixelHistogram::worker_loop(int thread_id) {
|
||||
@@ -120,24 +133,22 @@ NDArray<PixelHistogram::StorageType, 3> PixelHistogram::hdata() const {
|
||||
|
||||
NDArray<StorageType, 3> data({rows, cols, bins});
|
||||
|
||||
// Merge all partial histograms into data array
|
||||
std::memset(data.data(), 0, data.total_bytes());
|
||||
|
||||
// Each thread owns a disjoint, contiguous range of rows and its dense
|
||||
// storage layout [local_row][col][bin] is identical to the slice
|
||||
// [first_row .. first_row + local_rows)[col][bin] of `data`. So the
|
||||
// merge is just one bulk copy per thread; no per-element accumulation
|
||||
// and no upfront zeroing of `data` is needed.
|
||||
const size_t pixel_stride = static_cast<size_t>(cols) * bins;
|
||||
for (int t = 0; t < n_threads_; ++t) {
|
||||
const auto &storage = bh::unsafe_access::storage(partial_hists_[t]);
|
||||
const StorageType *partial_data = storage.data();
|
||||
const auto first_row = static_cast<ssize_t>(row_start(t));
|
||||
const auto local_rows = static_cast<ssize_t>(row_count(t));
|
||||
const auto first_row = static_cast<size_t>(row_start(t));
|
||||
const auto local_rows = static_cast<size_t>(row_count(t));
|
||||
if (local_rows == 0) continue;
|
||||
|
||||
for (ssize_t local_row = 0; local_row < local_rows; ++local_row) {
|
||||
for (ssize_t col = 0; col < cols; ++col) {
|
||||
const auto src_offset = (local_row * cols + col) * bins;
|
||||
const auto dst_offset = ((first_row + local_row) * cols + col) * bins;
|
||||
for (ssize_t bin = 0; bin < bins; ++bin) {
|
||||
data.data()[dst_offset + bin] += partial_data[src_offset + bin];
|
||||
}
|
||||
}
|
||||
}
|
||||
std::memcpy(data.data() + first_row * pixel_stride,
|
||||
partial_data,
|
||||
local_rows * pixel_stride * sizeof(StorageType));
|
||||
}
|
||||
|
||||
return data;
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <catch2/generators/catch_generators.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "test_config.hpp"
|
||||
#include "test_macros.hpp"
|
||||
@@ -77,3 +80,129 @@ TEST_CASE("Fill pixels with uneven partial histogram row slices"){
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Row partitioning handles rows < n_threads * ceil(rows/n_threads)") {
|
||||
// Regression test for the pre-existing row partitioning bug: with the
|
||||
// old ceil(rows / n_threads) scheme, rows=17 and n_threads=8 left
|
||||
// trailing threads with negative row counts and threw
|
||||
// "stop >= start required" from boost::histogram during construction.
|
||||
// The current scheme distributes the remainder so every thread gets
|
||||
// at least floor(rows / n_threads) rows. We also exercise the case
|
||||
// where n_threads > rows, which is clamped down to rows.
|
||||
const auto p = GENERATE(table<int, int>({
|
||||
{17, 8}, // previously broken: 8 threads * ceil(17/8) = 24 > 17
|
||||
{17, 5}, // previously broken: 5 threads * ceil(17/5) = 20 > 17
|
||||
{13, 4}, // previously broken: 4 * ceil(13/4) = 16 > 13
|
||||
{17, 32}, // n_threads > rows -> clamped to rows
|
||||
{1, 8}, // n_threads > rows -> single thread, single row
|
||||
{6, 6}, // balanced
|
||||
}));
|
||||
const int rows = std::get<0>(p);
|
||||
const int n_threads = std::get<1>(p);
|
||||
const int cols = 3;
|
||||
const int n_bins = 4;
|
||||
constexpr float xmin = 0.0f;
|
||||
constexpr float xmax = 4.0f;
|
||||
|
||||
PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads);
|
||||
|
||||
// Put one in-range value per row so we can verify every row is covered.
|
||||
NDArray<float, 2> img({rows, cols}, -1.0f); // -1 is out of range
|
||||
for (ssize_t r = 0; r < rows; ++r) {
|
||||
img(r, 0) = static_cast<float>(r % n_bins) + 0.5f;
|
||||
}
|
||||
hist.fill(img.view());
|
||||
|
||||
auto h = hist.hdata();
|
||||
REQUIRE(h.shape(0) == rows);
|
||||
REQUIRE(h.shape(1) == cols);
|
||||
REQUIRE(h.shape(2) == n_bins);
|
||||
|
||||
for (ssize_t r = 0; r < rows; ++r) {
|
||||
const int expected_bin = static_cast<int>(r % n_bins);
|
||||
for (ssize_t c = 0; c < cols; ++c) {
|
||||
for (ssize_t b = 0; b < n_bins; ++b) {
|
||||
const uint16_t want =
|
||||
(c == 0 && b == expected_bin) ? uint16_t{1} : uint16_t{0};
|
||||
if (h(r, c, b) != want) {
|
||||
INFO("rows=" << rows << " n_threads=" << n_threads
|
||||
<< " r=" << r << " c=" << c << " b=" << b
|
||||
<< " got=" << h(r, c, b) << " want=" << want);
|
||||
CHECK(h(r, c, b) == want);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Random fills match a reference implementation") {
|
||||
// End-to-end correctness check: compare the implementation against a
|
||||
// simple per-pixel reference for several thread counts. Values are
|
||||
// sampled slightly wider than [xmin, xmax) so the out-of-range filter
|
||||
// is also exercised.
|
||||
constexpr int rows = 17;
|
||||
constexpr int cols = 23;
|
||||
constexpr int n_bins = 32;
|
||||
constexpr float xmin = -1.5f;
|
||||
constexpr float xmax = 4.5f;
|
||||
const int n_threads = GENERATE(1, 2, 4, 8);
|
||||
|
||||
PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads);
|
||||
|
||||
std::mt19937 rng(0xC0FFEE);
|
||||
std::uniform_real_distribution<float> dist(xmin - 0.5f, xmax + 0.5f);
|
||||
|
||||
constexpr int n_frames = 4;
|
||||
std::vector<NDArray<float, 2>> frames;
|
||||
frames.reserve(n_frames);
|
||||
for (int f = 0; f < n_frames; ++f) {
|
||||
NDArray<float, 2> img({rows, cols}, 0.0f);
|
||||
for (ssize_t r = 0; r < rows; ++r) {
|
||||
for (ssize_t c = 0; c < cols; ++c) {
|
||||
img(r, c) = dist(rng);
|
||||
}
|
||||
}
|
||||
frames.push_back(std::move(img));
|
||||
}
|
||||
|
||||
NDArray<uint32_t, 3> expected({rows, cols, n_bins}, 0u);
|
||||
const float inv_range = static_cast<float>(n_bins) / (xmax - xmin);
|
||||
for (const auto& img : frames) {
|
||||
for (ssize_t r = 0; r < rows; ++r) {
|
||||
for (ssize_t c = 0; c < cols; ++c) {
|
||||
const float v = img(r, c);
|
||||
if (!(v >= xmin) || !(v < xmax)) continue;
|
||||
int bin = static_cast<int>((v - xmin) * inv_range);
|
||||
if (bin >= n_bins) bin = n_bins - 1;
|
||||
if (bin < 0) bin = 0;
|
||||
expected(r, c, bin) += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& img : frames) {
|
||||
hist.fill(img.view());
|
||||
}
|
||||
auto h = hist.hdata();
|
||||
|
||||
REQUIRE(h.shape(0) == rows);
|
||||
REQUIRE(h.shape(1) == cols);
|
||||
REQUIRE(h.shape(2) == n_bins);
|
||||
|
||||
bool all_match = true;
|
||||
for (ssize_t r = 0; r < rows && all_match; ++r) {
|
||||
for (ssize_t c = 0; c < cols && all_match; ++c) {
|
||||
for (ssize_t b = 0; b < n_bins && all_match; ++b) {
|
||||
if (h(r, c, b) != expected(r, c, b)) {
|
||||
all_match = false;
|
||||
INFO("n_threads=" << n_threads << " r=" << r
|
||||
<< " c=" << c << " b=" << b
|
||||
<< " got=" << h(r, c, b)
|
||||
<< " expected=" << expected(r, c, b));
|
||||
CHECK(h(r, c, b) == expected(r, c, b));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
CHECK(all_match);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user