From ee6b6dc990b06100a2c2a2cda140930d7a6bdeed Mon Sep 17 00:00:00 2001 From: Erik Frojdh Date: Sun, 24 May 2026 07:47:05 +0200 Subject: [PATCH] row bug and memcpy --- include/aare/PixelHistogram.hpp | 9 ++- src/PixelHistogram.cpp | 51 ++++++++----- src/PixelHistogram.test.cpp | 129 ++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 22 deletions(-) diff --git a/include/aare/PixelHistogram.hpp b/include/aare/PixelHistogram.hpp index c37cf34..ae72f15 100644 --- a/include/aare/PixelHistogram.hpp +++ b/include/aare/PixelHistogram.hpp @@ -31,7 +31,12 @@ class PixelHistogram { const AxisType xmin_; const AxisType xmax_; std::vector partial_hists_; - + // Cumulative row offsets so that thread t owns rows + // [row_offsets_[t], row_offsets_[t + 1]) + // Length is n_threads_ + 1; partition is balanced (the first + // rows_ % n_threads_ threads get one extra row each). + std::vector row_offsets_; + // Thread pool members std::vector workers_; std::mutex work_mutex_; @@ -41,7 +46,7 @@ class PixelHistogram { std::atomic completed_threads_; std::atomic stop_workers_; int work_generation_; - + // Private worker thread method void worker_loop(int thread_id); int row_start(int thread_id) const; diff --git a/src/PixelHistogram.cpp b/src/PixelHistogram.cpp index 8f54b8e..228a225 100644 --- a/src/PixelHistogram.cpp +++ b/src/PixelHistogram.cpp @@ -23,7 +23,23 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub n_threads_ = std::min(n_threads_, rows_); + // Build a balanced row partition. With base = rows_ / n_threads_ and + // extra = rows_ % n_threads_, the first `extra` threads get base + 1 + // rows each and the rest get `base` rows. This avoids the + // ceil(rows/n_threads) scheme leaving trailing threads with zero or + // negative row counts (e.g. rows=17, n_threads=8). + row_offsets_.resize(n_threads_ + 1); + const int base = rows_ / n_threads_; + const int extra = rows_ % n_threads_; + int offset = 0; + for (int i = 0; i < n_threads_; ++i) { + row_offsets_[i] = offset; + offset += base + (i < extra ? 1 : 0); + } + row_offsets_[n_threads_] = offset; // == rows_ by construction + // Initialize partial histograms for each thread + partial_hists_.reserve(n_threads_); for (int i = 0; i < n_threads_; ++i) { const auto local_rows = row_count(i); partial_hists_.emplace_back( @@ -33,7 +49,7 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub bh::axis::integer(0, local_rows, "x") ); } - + // Spawn worker threads for (int i = 0; i < n_threads_; ++i) { workers_.emplace_back([this, i]() { this->worker_loop(i); }); @@ -57,14 +73,11 @@ PixelHistogram::~PixelHistogram() { } int PixelHistogram::row_start(int thread_id) const { - const int rows_per_thread = (rows_ + n_threads_ - 1) / n_threads_; - return thread_id * rows_per_thread; + return row_offsets_[thread_id]; } int PixelHistogram::row_count(int thread_id) const { - const int start = row_start(thread_id); - const int end = std::min(start + (rows_ + n_threads_ - 1) / n_threads_, rows_); - return end - start; + return row_offsets_[thread_id + 1] - row_offsets_[thread_id]; } void PixelHistogram::worker_loop(int thread_id) { @@ -120,24 +133,22 @@ NDArray PixelHistogram::hdata() const { NDArray data({rows, cols, bins}); - // Merge all partial histograms into data array - std::memset(data.data(), 0, data.total_bytes()); - + // Each thread owns a disjoint, contiguous range of rows and its dense + // storage layout [local_row][col][bin] is identical to the slice + // [first_row .. first_row + local_rows)[col][bin] of `data`. So the + // merge is just one bulk copy per thread; no per-element accumulation + // and no upfront zeroing of `data` is needed. + const size_t pixel_stride = static_cast(cols) * bins; for (int t = 0; t < n_threads_; ++t) { const auto &storage = bh::unsafe_access::storage(partial_hists_[t]); const StorageType *partial_data = storage.data(); - const auto first_row = static_cast(row_start(t)); - const auto local_rows = static_cast(row_count(t)); + const auto first_row = static_cast(row_start(t)); + const auto local_rows = static_cast(row_count(t)); + if (local_rows == 0) continue; - for (ssize_t local_row = 0; local_row < local_rows; ++local_row) { - for (ssize_t col = 0; col < cols; ++col) { - const auto src_offset = (local_row * cols + col) * bins; - const auto dst_offset = ((first_row + local_row) * cols + col) * bins; - for (ssize_t bin = 0; bin < bins; ++bin) { - data.data()[dst_offset + bin] += partial_data[src_offset + bin]; - } - } - } + std::memcpy(data.data() + first_row * pixel_stride, + partial_data, + local_rows * pixel_stride * sizeof(StorageType)); } return data; diff --git a/src/PixelHistogram.test.cpp b/src/PixelHistogram.test.cpp index c1acec9..f79dfb7 100644 --- a/src/PixelHistogram.test.cpp +++ b/src/PixelHistogram.test.cpp @@ -1,6 +1,9 @@ #include #include +#include +#include +#include #include "test_config.hpp" #include "test_macros.hpp" @@ -77,3 +80,129 @@ TEST_CASE("Fill pixels with uneven partial histogram row slices"){ } } } + +TEST_CASE("Row partitioning handles rows < n_threads * ceil(rows/n_threads)") { + // Regression test for the pre-existing row partitioning bug: with the + // old ceil(rows / n_threads) scheme, rows=17 and n_threads=8 left + // trailing threads with negative row counts and threw + // "stop >= start required" from boost::histogram during construction. + // The current scheme distributes the remainder so every thread gets + // at least floor(rows / n_threads) rows. We also exercise the case + // where n_threads > rows, which is clamped down to rows. + const auto p = GENERATE(table({ + {17, 8}, // previously broken: 8 threads * ceil(17/8) = 24 > 17 + {17, 5}, // previously broken: 5 threads * ceil(17/5) = 20 > 17 + {13, 4}, // previously broken: 4 * ceil(13/4) = 16 > 13 + {17, 32}, // n_threads > rows -> clamped to rows + {1, 8}, // n_threads > rows -> single thread, single row + {6, 6}, // balanced + })); + const int rows = std::get<0>(p); + const int n_threads = std::get<1>(p); + const int cols = 3; + const int n_bins = 4; + constexpr float xmin = 0.0f; + constexpr float xmax = 4.0f; + + PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads); + + // Put one in-range value per row so we can verify every row is covered. + NDArray img({rows, cols}, -1.0f); // -1 is out of range + for (ssize_t r = 0; r < rows; ++r) { + img(r, 0) = static_cast(r % n_bins) + 0.5f; + } + hist.fill(img.view()); + + auto h = hist.hdata(); + REQUIRE(h.shape(0) == rows); + REQUIRE(h.shape(1) == cols); + REQUIRE(h.shape(2) == n_bins); + + for (ssize_t r = 0; r < rows; ++r) { + const int expected_bin = static_cast(r % n_bins); + for (ssize_t c = 0; c < cols; ++c) { + for (ssize_t b = 0; b < n_bins; ++b) { + const uint16_t want = + (c == 0 && b == expected_bin) ? uint16_t{1} : uint16_t{0}; + if (h(r, c, b) != want) { + INFO("rows=" << rows << " n_threads=" << n_threads + << " r=" << r << " c=" << c << " b=" << b + << " got=" << h(r, c, b) << " want=" << want); + CHECK(h(r, c, b) == want); + } + } + } + } +} + +TEST_CASE("Random fills match a reference implementation") { + // End-to-end correctness check: compare the implementation against a + // simple per-pixel reference for several thread counts. Values are + // sampled slightly wider than [xmin, xmax) so the out-of-range filter + // is also exercised. + constexpr int rows = 17; + constexpr int cols = 23; + constexpr int n_bins = 32; + constexpr float xmin = -1.5f; + constexpr float xmax = 4.5f; + const int n_threads = GENERATE(1, 2, 4, 8); + + PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads); + + std::mt19937 rng(0xC0FFEE); + std::uniform_real_distribution dist(xmin - 0.5f, xmax + 0.5f); + + constexpr int n_frames = 4; + std::vector> frames; + frames.reserve(n_frames); + for (int f = 0; f < n_frames; ++f) { + NDArray img({rows, cols}, 0.0f); + for (ssize_t r = 0; r < rows; ++r) { + for (ssize_t c = 0; c < cols; ++c) { + img(r, c) = dist(rng); + } + } + frames.push_back(std::move(img)); + } + + NDArray expected({rows, cols, n_bins}, 0u); + const float inv_range = static_cast(n_bins) / (xmax - xmin); + for (const auto& img : frames) { + for (ssize_t r = 0; r < rows; ++r) { + for (ssize_t c = 0; c < cols; ++c) { + const float v = img(r, c); + if (!(v >= xmin) || !(v < xmax)) continue; + int bin = static_cast((v - xmin) * inv_range); + if (bin >= n_bins) bin = n_bins - 1; + if (bin < 0) bin = 0; + expected(r, c, bin) += 1; + } + } + } + + for (const auto& img : frames) { + hist.fill(img.view()); + } + auto h = hist.hdata(); + + REQUIRE(h.shape(0) == rows); + REQUIRE(h.shape(1) == cols); + REQUIRE(h.shape(2) == n_bins); + + bool all_match = true; + for (ssize_t r = 0; r < rows && all_match; ++r) { + for (ssize_t c = 0; c < cols && all_match; ++c) { + for (ssize_t b = 0; b < n_bins && all_match; ++b) { + if (h(r, c, b) != expected(r, c, b)) { + all_match = false; + INFO("n_threads=" << n_threads << " r=" << r + << " c=" << c << " b=" << b + << " got=" << h(r, c, b) + << " expected=" << expected(r, c, b)); + CHECK(h(r, c, b) == expected(r, c, b)); + } + } + } + } + CHECK(all_match); +}