row bug and memcpy
Build on RHEL8 / build (push) Failing after 1m20s
Build on RHEL9 / build (push) Failing after 1m24s
Run tests using data on local RHEL8 / build (push) Failing after 1m55s

This commit is contained in:
Erik Frojdh
2026-05-24 07:47:05 +02:00
parent 61b765c4b8
commit ee6b6dc990
3 changed files with 167 additions and 22 deletions
+7 -2
View File
@@ -31,7 +31,12 @@ class PixelHistogram {
const AxisType xmin_;
const AxisType xmax_;
std::vector<Hist> partial_hists_;
// Cumulative row offsets so that thread t owns rows
// [row_offsets_[t], row_offsets_[t + 1])
// Length is n_threads_ + 1; partition is balanced (the first
// rows_ % n_threads_ threads get one extra row each).
std::vector<int> row_offsets_;
// Thread pool members
std::vector<std::thread> workers_;
std::mutex work_mutex_;
@@ -41,7 +46,7 @@ class PixelHistogram {
std::atomic<int> completed_threads_;
std::atomic<bool> stop_workers_;
int work_generation_;
// Private worker thread method
void worker_loop(int thread_id);
int row_start(int thread_id) const;
+31 -20
View File
@@ -23,7 +23,23 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub
n_threads_ = std::min(n_threads_, rows_);
// Build a balanced row partition. With base = rows_ / n_threads_ and
// extra = rows_ % n_threads_, the first `extra` threads get base + 1
// rows each and the rest get `base` rows. This avoids the
// ceil(rows/n_threads) scheme leaving trailing threads with zero or
// negative row counts (e.g. rows=17, n_threads=8).
row_offsets_.resize(n_threads_ + 1);
const int base = rows_ / n_threads_;
const int extra = rows_ % n_threads_;
int offset = 0;
for (int i = 0; i < n_threads_; ++i) {
row_offsets_[i] = offset;
offset += base + (i < extra ? 1 : 0);
}
row_offsets_[n_threads_] = offset; // == rows_ by construction
// Initialize partial histograms for each thread
partial_hists_.reserve(n_threads_);
for (int i = 0; i < n_threads_; ++i) {
const auto local_rows = row_count(i);
partial_hists_.emplace_back(
@@ -33,7 +49,7 @@ PixelHistogram::PixelHistogram(int rows, int cols, int n_bins, double xmin, doub
bh::axis::integer<int, bh::use_default, bh::axis::option::none_t>(0, local_rows, "x")
);
}
// Spawn worker threads
for (int i = 0; i < n_threads_; ++i) {
workers_.emplace_back([this, i]() { this->worker_loop(i); });
@@ -57,14 +73,11 @@ PixelHistogram::~PixelHistogram() {
}
int PixelHistogram::row_start(int thread_id) const {
const int rows_per_thread = (rows_ + n_threads_ - 1) / n_threads_;
return thread_id * rows_per_thread;
return row_offsets_[thread_id];
}
int PixelHistogram::row_count(int thread_id) const {
const int start = row_start(thread_id);
const int end = std::min(start + (rows_ + n_threads_ - 1) / n_threads_, rows_);
return end - start;
return row_offsets_[thread_id + 1] - row_offsets_[thread_id];
}
void PixelHistogram::worker_loop(int thread_id) {
@@ -120,24 +133,22 @@ NDArray<PixelHistogram::StorageType, 3> PixelHistogram::hdata() const {
NDArray<StorageType, 3> data({rows, cols, bins});
// Merge all partial histograms into data array
std::memset(data.data(), 0, data.total_bytes());
// Each thread owns a disjoint, contiguous range of rows and its dense
// storage layout [local_row][col][bin] is identical to the slice
// [first_row .. first_row + local_rows)[col][bin] of `data`. So the
// merge is just one bulk copy per thread; no per-element accumulation
// and no upfront zeroing of `data` is needed.
const size_t pixel_stride = static_cast<size_t>(cols) * bins;
for (int t = 0; t < n_threads_; ++t) {
const auto &storage = bh::unsafe_access::storage(partial_hists_[t]);
const StorageType *partial_data = storage.data();
const auto first_row = static_cast<ssize_t>(row_start(t));
const auto local_rows = static_cast<ssize_t>(row_count(t));
const auto first_row = static_cast<size_t>(row_start(t));
const auto local_rows = static_cast<size_t>(row_count(t));
if (local_rows == 0) continue;
for (ssize_t local_row = 0; local_row < local_rows; ++local_row) {
for (ssize_t col = 0; col < cols; ++col) {
const auto src_offset = (local_row * cols + col) * bins;
const auto dst_offset = ((first_row + local_row) * cols + col) * bins;
for (ssize_t bin = 0; bin < bins; ++bin) {
data.data()[dst_offset + bin] += partial_data[src_offset + bin];
}
}
}
std::memcpy(data.data() + first_row * pixel_stride,
partial_data,
local_rows * pixel_stride * sizeof(StorageType));
}
return data;
+129
View File
@@ -1,6 +1,9 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>
#include <cstdint>
#include <random>
#include <vector>
#include "test_config.hpp"
#include "test_macros.hpp"
@@ -77,3 +80,129 @@ TEST_CASE("Fill pixels with uneven partial histogram row slices"){
}
}
}
TEST_CASE("Row partitioning handles rows < n_threads * ceil(rows/n_threads)") {
// Regression test for the pre-existing row partitioning bug: with the
// old ceil(rows / n_threads) scheme, rows=17 and n_threads=8 left
// trailing threads with negative row counts and threw
// "stop >= start required" from boost::histogram during construction.
// The current scheme distributes the remainder so every thread gets
// at least floor(rows / n_threads) rows. We also exercise the case
// where n_threads > rows, which is clamped down to rows.
const auto p = GENERATE(table<int, int>({
{17, 8}, // previously broken: 8 threads * ceil(17/8) = 24 > 17
{17, 5}, // previously broken: 5 threads * ceil(17/5) = 20 > 17
{13, 4}, // previously broken: 4 * ceil(13/4) = 16 > 13
{17, 32}, // n_threads > rows -> clamped to rows
{1, 8}, // n_threads > rows -> single thread, single row
{6, 6}, // balanced
}));
const int rows = std::get<0>(p);
const int n_threads = std::get<1>(p);
const int cols = 3;
const int n_bins = 4;
constexpr float xmin = 0.0f;
constexpr float xmax = 4.0f;
PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads);
// Put one in-range value per row so we can verify every row is covered.
NDArray<float, 2> img({rows, cols}, -1.0f); // -1 is out of range
for (ssize_t r = 0; r < rows; ++r) {
img(r, 0) = static_cast<float>(r % n_bins) + 0.5f;
}
hist.fill(img.view());
auto h = hist.hdata();
REQUIRE(h.shape(0) == rows);
REQUIRE(h.shape(1) == cols);
REQUIRE(h.shape(2) == n_bins);
for (ssize_t r = 0; r < rows; ++r) {
const int expected_bin = static_cast<int>(r % n_bins);
for (ssize_t c = 0; c < cols; ++c) {
for (ssize_t b = 0; b < n_bins; ++b) {
const uint16_t want =
(c == 0 && b == expected_bin) ? uint16_t{1} : uint16_t{0};
if (h(r, c, b) != want) {
INFO("rows=" << rows << " n_threads=" << n_threads
<< " r=" << r << " c=" << c << " b=" << b
<< " got=" << h(r, c, b) << " want=" << want);
CHECK(h(r, c, b) == want);
}
}
}
}
}
TEST_CASE("Random fills match a reference implementation") {
// End-to-end correctness check: compare the implementation against a
// simple per-pixel reference for several thread counts. Values are
// sampled slightly wider than [xmin, xmax) so the out-of-range filter
// is also exercised.
constexpr int rows = 17;
constexpr int cols = 23;
constexpr int n_bins = 32;
constexpr float xmin = -1.5f;
constexpr float xmax = 4.5f;
const int n_threads = GENERATE(1, 2, 4, 8);
PixelHistogram hist(rows, cols, n_bins, xmin, xmax, n_threads);
std::mt19937 rng(0xC0FFEE);
std::uniform_real_distribution<float> dist(xmin - 0.5f, xmax + 0.5f);
constexpr int n_frames = 4;
std::vector<NDArray<float, 2>> frames;
frames.reserve(n_frames);
for (int f = 0; f < n_frames; ++f) {
NDArray<float, 2> img({rows, cols}, 0.0f);
for (ssize_t r = 0; r < rows; ++r) {
for (ssize_t c = 0; c < cols; ++c) {
img(r, c) = dist(rng);
}
}
frames.push_back(std::move(img));
}
NDArray<uint32_t, 3> expected({rows, cols, n_bins}, 0u);
const float inv_range = static_cast<float>(n_bins) / (xmax - xmin);
for (const auto& img : frames) {
for (ssize_t r = 0; r < rows; ++r) {
for (ssize_t c = 0; c < cols; ++c) {
const float v = img(r, c);
if (!(v >= xmin) || !(v < xmax)) continue;
int bin = static_cast<int>((v - xmin) * inv_range);
if (bin >= n_bins) bin = n_bins - 1;
if (bin < 0) bin = 0;
expected(r, c, bin) += 1;
}
}
}
for (const auto& img : frames) {
hist.fill(img.view());
}
auto h = hist.hdata();
REQUIRE(h.shape(0) == rows);
REQUIRE(h.shape(1) == cols);
REQUIRE(h.shape(2) == n_bins);
bool all_match = true;
for (ssize_t r = 0; r < rows && all_match; ++r) {
for (ssize_t c = 0; c < cols && all_match; ++c) {
for (ssize_t b = 0; b < n_bins && all_match; ++b) {
if (h(r, c, b) != expected(r, c, b)) {
all_match = false;
INFO("n_threads=" << n_threads << " r=" << r
<< " c=" << c << " b=" << b
<< " got=" << h(r, c, b)
<< " expected=" << expected(r, c, b));
CHECK(h(r, c, b) == expected(r, c, b));
}
}
}
}
CHECK(all_match);
}