From ee6b6dc990b06100a2c2a2cda140930d7a6bdeed Mon Sep 17 00:00:00 2001
From: Erik Frojdh <erik.frojdh@psi.ch>
Date: Sun, 24 May 2026 07:47:05 +0200
Subject: [PATCH] row bug and memcpy

---
 include/aare/PixelHistogram.hpp |   9 ++-
 src/PixelHistogram.cpp          |  51 ++++++++-----
 src/PixelHistogram.test.cpp     | 129 ++++++++++++++++++++++++++++++++
 3 files changed, 167 insertions(+), 22 deletions(-)
diff --git a/include/aare/PixelHistogram.hpp b/include/aare/PixelHistogram.hpp
index c37cf34..ae72f15 100644
--- a/include/aare/PixelHistogram.hpp
+++ b/include/aare/PixelHistogram.hpp
@@ -31,7 +31,12 @@ class PixelHistogram {
     const AxisType xmin_;
     const AxisType xmax_;
     std::vector<Hist> partial_hists_;
-    
+    // Cumulative row offsets so that thread t owns rows
+    //     [row_offsets_[t], row_offsets_[t + 1])
+    // Length is n_threads_ + 1; partition is balanced (the first
+    // rows_ % n_threads_ threads get one extra row each).
+    std::vector<int> row_offsets_;
+
     // Thread pool members
     std::vector<std::thread> workers_;
     std::mutex work_mutex_;
@@ -41,7 +46,7 @@ class PixelHistogram {
     std::atomic<int> completed_threads_;
     std::atomic<bool> stop_workers_;
     int work_generation_;
-    
+
     // Private worker thread method
     void worker_loop(int thread_id);
     int row_start(int thread_id) const;
diff --git a/src/PixelHistogram.cpp b/src/PixelHistogram.cpp
index 8f54b8e..228a225 100644
--- a/src/PixelHistogram.cpp
+++ b/src/PixelHistogram.cpp
@@ -23,7 +23,23 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub
 
     n_threads_ = std::min(n_threads_, rows_);
 
+    // Build a balanced row partition. With base = rows_ / n_threads_ and
+    // extra = rows_ % n_threads_, the first `extra` threads get base + 1
+    // rows each and the rest get `base` rows. This avoids the
+    // ceil(rows/n_threads) scheme leaving trailing threads with zero or
+    // negative row counts (e.g. rows=17, n_threads=8).
+    row_offsets_.resize(n_threads_ + 1);
+    const int base  = rows_ / n_threads_;
+    const int extra = rows_ % n_threads_;
+    int offset = 0;
+    for (int i = 0; i < n_threads_; ++i) {
+        row_offsets_[i] = offset;
+        offset += base + (i < extra ? 1 : 0);
+    }
+    row_offsets_[n_threads_] = offset; // == rows_ by construction
+
     // Initialize partial histograms for each thread
+    partial_hists_.reserve(n_threads_);
     for (int i = 0; i < n_threads_; ++i) {
         const auto local_rows = row_count(i);
         partial_hists_.emplace_back(
@@ -33,7 +49,7 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub
             bh::axis::integer<int, bh::use_default, bh::axis::option::none_t>(0, local_rows, "x")
         );
     }
-    
+
     // Spawn worker threads
     for (int i = 0; i < n_threads_; ++i) {
         workers_.emplace_back([this, i]() { this->worker_loop(i); });
@@ -57,14 +73,11 @@ PixelHistogram::~PixelHistogram() {
 }
 
 int PixelHistogram::row_start(int thread_id) const {
-    const int rows_per_thread = (rows_ + n_threads_ - 1) / n_threads_;
-    return thread_id * rows_per_thread;
+    return row_offsets_[thread_id];
 }
 
 int PixelHistogram::row_count(int thread_id) const {
-    const int start = row_start(thread_id);
-    const int end = std::min(start + (rows_ + n_threads_ - 1) / n_threads_, rows_);
-    return end - start;
+    return row_offsets_[thread_id + 1] - row_offsets_[thread_id];
 }
 
 void PixelHistogram::worker_loop(int thread_id) {
@@ -120,24 +133,22 @@ NDArray<PixelHistogram::StorageType, 3> PixelHistogram::hdata() const {
 
     NDArray<StorageType, 3> data({rows, cols, bins});
 
-    // Merge all partial histograms into data array
-    std::memset(data.data(), 0, data.total_bytes());
-    
+    // Each thread owns a disjoint, contiguous range of rows and its dense
+    // storage layout [local_row][col][bin] is identical to the slice
+    // [first_row .. first_row + local_rows)[col][bin] of `data`. So the
+    // merge is just one bulk copy per thread; no per-element accumulation
+    // and no upfront zeroing of `data` is needed.
+    const size_t pixel_stride = static_cast<size_t>(cols) * bins;
     for (int t = 0; t < n_threads_; ++t) {
         const auto &storage = bh::unsafe_access::storage(partial_hists_[t]);
         const StorageType *partial_data = storage.data();
-        const auto first_row = static_cast<ssize_t>(row_start(t));
-        const auto local_rows = static_cast<ssize_t>(row_count(t));
+        const auto first_row  = static_cast<size_t>(row_start(t));
+        const auto local_rows = static_cast<size_t>(row_count(t));
+        if (local_rows == 0) continue;
 
-        for (ssize_t local_row = 0; local_row < local_rows; ++local_row) {
-            for (ssize_t col = 0; col < cols; ++col) {
-                const auto src_offset = (local_row * cols + col) * bins;
-                const auto dst_offset = ((first_row + local_row) * cols + col) * bins;
-                for (ssize_t bin = 0; bin < bins; ++bin) {
-                    data.data()[dst_offset + bin] += partial_data[src_offset + bin];
-                }
-            }
-        }
+        std::memcpy(data.data() + first_row * pixel_stride,
+                    partial_data,
+                    local_rows * pixel_stride * sizeof(StorageType));
     }
 
     return data;
diff --git a/src/PixelHistogram.test.cpp b/src/PixelHistogram.test.cpp
index c1acec9..f79dfb7 100644
--- a/src/PixelHistogram.test.cpp
+++ b/src/PixelHistogram.test.cpp
@@ -1,6 +1,9 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators.hpp>
 
+#include <cstdint>
+#include <random>
+#include <vector>
 
 #include "test_config.hpp"
 #include "test_macros.hpp"
@@ -77,3 +80,129 @@ TEST_CASE("Fill pixels with uneven partial histogram row slices"){
         }
     }
 }
+
+TEST_CASE("Row partitioning handles rows < n_threads * ceil(rows/n_threads)") {
+    // Regression test for the pre-existing row partitioning bug: with the
+    // old ceil(rows / n_threads) scheme, rows=17 and n_threads=8 left
+    // trailing threads with negative row counts and threw
+    // "stop >= start required" from boost::histogram during construction.
+    // The current scheme distributes the remainder so every thread gets
+    // at least floor(rows / n_threads) rows. We also exercise the case
+    // where n_threads > rows, which is clamped down to rows.
+    const auto p = GENERATE(table<int, int>({
+        {17, 8},   // previously broken: 8 threads * ceil(17/8) = 24 > 17
+        {17, 5},   // previously broken: 5 threads * ceil(17/5) = 20 > 17
+        {13, 4},   // previously broken: 4 * ceil(13/4) = 16 > 13
+        {17, 32},  // n_threads > rows -> clamped to rows
+        {1, 8},    // n_threads > rows -> single thread, single row
+        {6, 6},    // balanced
+    }));
+    const int rows = std::get<0>(p);
+    const int n_threads = std::get<1>(p);
+    const int cols = 3;
+    const int n_bins = 4;
+    constexpr float xmin = 0.0f;
+    constexpr float xmax = 4.0f;
+
+    PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads);
+
+    // Put one in-range value per row so we can verify every row is covered.
+    NDArray<float, 2> img({rows, cols}, -1.0f); // -1 is out of range
+    for (ssize_t r = 0; r < rows; ++r) {
+        img(r, 0) = static_cast<float>(r % n_bins) + 0.5f;
+    }
+    hist.fill(img.view());
+
+    auto h = hist.hdata();
+    REQUIRE(h.shape(0) == rows);
+    REQUIRE(h.shape(1) == cols);
+    REQUIRE(h.shape(2) == n_bins);
+
+    for (ssize_t r = 0; r < rows; ++r) {
+        const int expected_bin = static_cast<int>(r % n_bins);
+        for (ssize_t c = 0; c < cols; ++c) {
+            for (ssize_t b = 0; b < n_bins; ++b) {
+                const uint16_t want =
+                    (c == 0 && b == expected_bin) ? uint16_t{1} : uint16_t{0};
+                if (h(r, c, b) != want) {
+                    INFO("rows=" << rows << " n_threads=" << n_threads
+                         << " r=" << r << " c=" << c << " b=" << b
+                         << " got=" << h(r, c, b) << " want=" << want);
+                    CHECK(h(r, c, b) == want);
+                }
+            }
+        }
+    }
+}
+
+TEST_CASE("Random fills match a reference implementation") {
+    // End-to-end correctness check: compare the implementation against a
+    // simple per-pixel reference for several thread counts. Values are
+    // sampled slightly wider than [xmin, xmax) so the out-of-range filter
+    // is also exercised.
+    constexpr int rows = 17;
+    constexpr int cols = 23;
+    constexpr int n_bins = 32;
+    constexpr float xmin = -1.5f;
+    constexpr float xmax = 4.5f;
+    const int n_threads = GENERATE(1, 2, 4, 8);
+
+    PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads);
+
+    std::mt19937 rng(0xC0FFEE);
+    std::uniform_real_distribution<float> dist(xmin - 0.5f, xmax + 0.5f);
+
+    constexpr int n_frames = 4;
+    std::vector<NDArray<float, 2>> frames;
+    frames.reserve(n_frames);
+    for (int f = 0; f < n_frames; ++f) {
+        NDArray<float, 2> img({rows, cols}, 0.0f);
+        for (ssize_t r = 0; r < rows; ++r) {
+            for (ssize_t c = 0; c < cols; ++c) {
+                img(r, c) = dist(rng);
+            }
+        }
+        frames.push_back(std::move(img));
+    }
+
+    NDArray<uint32_t, 3> expected({rows, cols, n_bins}, 0u);
+    const float inv_range = static_cast<float>(n_bins) / (xmax - xmin);
+    for (const auto& img : frames) {
+        for (ssize_t r = 0; r < rows; ++r) {
+            for (ssize_t c = 0; c < cols; ++c) {
+                const float v = img(r, c);
+                if (!(v >= xmin) || !(v < xmax)) continue;
+                int bin = static_cast<int>((v - xmin) * inv_range);
+                if (bin >= n_bins) bin = n_bins - 1;
+                if (bin < 0) bin = 0;
+                expected(r, c, bin) += 1;
+            }
+        }
+    }
+
+    for (const auto& img : frames) {
+        hist.fill(img.view());
+    }
+    auto h = hist.hdata();
+
+    REQUIRE(h.shape(0) == rows);
+    REQUIRE(h.shape(1) == cols);
+    REQUIRE(h.shape(2) == n_bins);
+
+    bool all_match = true;
+    for (ssize_t r = 0; r < rows && all_match; ++r) {
+        for (ssize_t c = 0; c < cols && all_match; ++c) {
+            for (ssize_t b = 0; b < n_bins && all_match; ++b) {
+                if (h(r, c, b) != expected(r, c, b)) {
+                    all_match = false;
+                    INFO("n_threads=" << n_threads << " r=" << r
+                         << " c=" << c << " b=" << b
+                         << " got=" << h(r, c, b)
+                         << " expected=" << expected(r, c, b));
+                    CHECK(h(r, c, b) == expected(r, c, b));
+                }
+            }
+        }
+    }
+    CHECK(all_match);
+}