Files
Jungfraujoch/image_analysis/spot_finding/ImageSpotFinderGPU.cu
2025-10-20 20:43:44 +02:00

280 lines
14 KiB
Plaintext

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
// GPU Spot finding developed by Hans-Christian Stadler (PSI)
// Copyright (2019-2023) Paul Scherrer Institute
#include "ImageSpotFinderGPU.h"
#include "../../common/JFJochException.h"
struct spot_parameters {
int32_t width;
int32_t height;
float strong_pixel_threshold2;
int32_t count_threshold;
};
// input X x Y pixels array
// output X x Y bit array
static constexpr int WARP_SIZE = 32; // assume warp size of 32 cuda threads per warp
inline void cuda_err(cudaError_t val) {
if (val != cudaSuccess)
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
// Write pixel results to bit array
// params: spot finding parameters
// out: pixel result bit array
// pixel: flat pixel index = bit index into bit array
// val: pixel result
// **NOTE**: assumes sizeof(*out) * 8 == WARP_SIZE
__device__ __forceinline__ void write_result(const spot_parameters& params, uint32_t* out, int32_t pixel, uint8_t val)
{
static_assert(sizeof(*out) * 8 == WARP_SIZE, "Violation of essential implementation assumption: WARP_SIZE must match output array element type bit size!");
static constexpr unsigned ALL_THREADS = UINT32_MAX;
const int32_t laneid = threadIdx.x & (WARP_SIZE - 1);
unsigned result = __ballot_sync(ALL_THREADS, val);
const int32_t idx = pixel / WARP_SIZE; // global uint32_t index
const int32_t bit = pixel % WARP_SIZE; // local bit index
if ((bit >= laneid) && (laneid == 0)) { // write to upper part of uint32_t
result <<= bit;
if (result)
atomicOr(&out[idx], result);
} else if ((bit < laneid) && (bit == 0)) { // write to lower part of uint32_t
result >>= laneid;
if (result)
atomicOr(&out[idx], result);
}
}
// Determine if pixel could be a spot
// params: spot finding parameters
// val: pixel value
// sum: window sum
// sum2: window sum of squares
// count: window valid pixels count
// return the pixel result: 0-no spot / 1-spot candidate
__device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, const int64_t val, int64_t sum, int64_t sum2, int64_t count)
{
sum -= val;
sum2 -= val * val;
count -= 1;
const int64_t var = count * sum2 - (sum * sum); // This should be divided by ((2*NBX+1) * (2*NBY+1)-1)*((2*NBX+1) * (2*NBY+1))
const int64_t in_minus_mean = val * count - sum; // Should be divided by ((2*NBX+1) * (2*NBY+1));
const int64_t tmp1 = in_minus_mean * in_minus_mean;
const float tmp2 = var * params.strong_pixel_threshold2;
bool snr_criterion;
if (params.strong_pixel_threshold2 == 0.0f)
snr_criterion = true;
else
snr_criterion = (count > ImageSpotFinder::MIN_VALID_PIXELS) && (in_minus_mean > 0) && (tmp1 > tmp2);
bool count_criterion = (params.count_threshold == 0.0f) || (val > params.count_threshold);
bool strong_pixel = snr_criterion && count_criterion;
if (val == INT32_MAX)
strong_pixel = true;
else if (val == INT32_MIN)
strong_pixel = false;
return strong_pixel ? 1 : 0;
}
// Find pixels that could be spots
// in: image input values
// out: pixel result bit array, 1 bit per pixel (0:no/1:candidate spot)
// params: spot finding parameters
//
// The algorithm uses multiple waves (blockDim.y) that run over sections of rows.
// Each wave will write output at the back row and read input at the front row.
// Each wave is split into column output sections (blockDim.x)
// A wave section (block) is responsible for a particular row/column section and
// maintains sum/sum2/count values per column for the output row.
// Every cuda thread is associated with a particular column. The thread maintains
// the sum/sum2/count values in shared memory for it's column. To do this, the input
// pixel values for the hight of the aggregation window are saved in shared memory.
__global__ void analyze_pixel(const int32_t *in, uint32_t *prev_out, uint32_t *out, const spot_parameters params)
{
// assumption: 2 * params.nby + 1 <= params.rows and 2 * params.nbx + 1 <= params.width
const int32_t window = 2 * (int)ImageSpotFinder::NBX + 1; // vertical window
const int32_t writeSize = blockDim.x - 2 * ImageSpotFinder::NBX; // output columns per block
const int32_t cmin = blockIdx.x * writeSize; // lowest output column
const int32_t cmax = min(cmin + writeSize, static_cast<int32_t>(params.width)); // past highest output column
const int32_t col = cmin + threadIdx.x - ImageSpotFinder::NBX; // thread -> column mapping
const bool data_col = (col >= 0) && (col < static_cast<int32_t>(params.width)); // read global mem
const bool result_col = (col >= cmin) && (col < cmax); // write result
const int32_t nWaves = gridDim.y; // number of waves
const int32_t rowsPerWave = (params.height + nWaves - 1) / nWaves; // rows per wave
const int32_t rmin = blockIdx.y * rowsPerWave; // lowest result row for this wave
const int32_t rmax = min(rmin + rowsPerWave, static_cast<int32_t>(params.height)); // past highest result row for this wave
const int32_t left = max(static_cast<int32_t>(threadIdx.x) - static_cast<int32_t>(ImageSpotFinder::NBX), 0); // leftmost column touched by this thread
const int32_t right = min(static_cast<int32_t>(threadIdx.x) + static_cast<int32_t>(ImageSpotFinder::NBX) + 1, static_cast<int32_t>(params.width)); // past rightmost column touched by this thread
int32_t back = rmin; // back of wave for writing
int32_t front = max(back - static_cast<int32_t>(ImageSpotFinder::NBX), 0); // front of wave for reading (needs to overtake back initially)
extern __shared__ int64_t shared_mem[];
int64_t* shared_sum = shared_mem; // shared buffer [blockDim.x]
int64_t* shared_sum2 = &shared_sum[blockDim.x]; // shared buffer [blockDim.x]
auto shared_count = reinterpret_cast<int16_t*>(&shared_sum2[blockDim.x]); // shared buffer [blockDim.x]
auto shared_val = reinterpret_cast<int32_t *>(&shared_count[blockDim.x]); // shared cyclic buffer [window, blockDim.x]
int64_t total_sum; // totals
int64_t total_sum2;
int32_t total_count;
// initialize sum, sum2, count, val buffers
shared_sum[threadIdx.x] = 0; // shared values without effect on totals
shared_sum2[threadIdx.x] = 0;
shared_count[threadIdx.x] = 0;
for (int i=0; i<window; i++)
shared_val[i * blockDim.x + threadIdx.x] = INT32_MIN; // value that is NOT counted
// wave front up to rmin + nby + 1
do {
if (data_col) { // read at the front end of the wave
const int32_t npixel = front * params.width + col;
const bool sat = ((prev_out[npixel / 32] & (1U << (npixel % 32))) != 0);
const int32_t val = sat ? INT32_MAX : in[npixel];
shared_val[(front % window) * blockDim.x + threadIdx.x] = val;
if (val != INT32_MAX && val != INT32_MIN) {
shared_sum[threadIdx.x] += val;
shared_sum2[threadIdx.x] += val * val;
shared_count[threadIdx.x] += 1;
}
}
front++;
} while (front < rmin + static_cast<int32_t>(ImageSpotFinder::NBX) + 1);
// wave front up to rmax
do {
__syncthreads(); // make others see the shared values
uint8_t val = 0;
if (result_col) { // write at the back end of the wave
total_sum = total_sum2 = total_count = 0;
for (auto j = left; j < right; j++) {
total_sum += shared_sum[j];
total_sum2 += shared_sum2[j];
total_count += shared_count[j];
}
val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
}
write_result(params, out, back * params.width + col, val);
back++;
__syncthreads(); // keep shared values until others have seen them
if (data_col) { // read at the front end of the wave
int16_t cnt = 0;
int32_t old = shared_val[(front % window) * blockDim.x + threadIdx.x];
if (old == INT32_MAX || old == INT32_MIN) {
old = 0; // no effect value
cnt = 1; // bring count to normal
}
int32_t val = in[front * params.width + col];
shared_val[(front % window) * blockDim.x + threadIdx.x] = val;
if (val == INT32_MAX || val == INT32_MIN) {
val = 0; // no effect value
cnt -= 1; // count diff from normal
}
shared_sum[threadIdx.x] += val - old;
shared_sum2[threadIdx.x] += val * val - old * old;
shared_count[threadIdx.x] += cnt;
}
front++;
} while (front < rmax);
// wave back up to rmax
do {
__syncthreads(); // make others see the shared values
uint8_t val = 0;
if (result_col) { // write at the back end of the wave
total_sum = total_sum2 = total_count = 0;
for (auto j = left; j < right; j++) {
total_sum += shared_sum[j];
total_sum2 += shared_sum2[j];
total_count += shared_count[j];
}
val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
}
write_result(params, out, back * params.width + col, val);
back++;
__syncthreads(); // keep shared values until others have seen them
if (data_col) { // read at the front end of the wave if possible
int16_t cnt = -1; // normal count diff
int32_t old = shared_val[(front % window) * blockDim.x + threadIdx.x];
if (old == INT32_MAX || old == INT32_MIN) {
old = 0; // no effect value
cnt += 1; // bring count to normal
}
int32_t val = 0;
if (front < params.height) {
const int32_t npixel = front * params.width + col;
const bool sat = ((prev_out[npixel / 32] & (1U << (npixel % 32))) != 0);
val = sat ? INT32_MAX : in[npixel];
if (val == INT32_MAX || val == INT32_MIN)
val = 0; // no effect value
else
cnt += 1; // count diff from normal
}
shared_sum[threadIdx.x] += val - old;
shared_sum2[threadIdx.x] += val * val - old * old;
shared_count[threadIdx.x] += cnt;
}
front++;
} while (back < rmax);
}
ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height) :
ImageSpotFinder(in_width, in_height),
input_buffer_reg(input_buffer),
output_buffer_reg(output_buffer) {
gpu_in = CudaDevicePtr<int32_t>(width * height);
gpu_out_0 = CudaDevicePtr<uint32_t>(OutputSize());
gpu_out_1 = CudaDevicePtr<uint32_t>(OutputSize());
}
std::vector<DiffractionSpot> ImageSpotFinderGPU::Run(const SpotFindingSettings &settings, const std::vector<bool> &res_mask) {
spot_parameters spot_params{};
spot_params.height = height;
spot_params.width = width;
spot_params.strong_pixel_threshold2 = settings.signal_to_noise_threshold * settings.signal_to_noise_threshold;
spot_params.count_threshold = settings.photon_count_threshold;
if (2 * NBX + 1 > windowSizeLimit)
throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nbx exceeds window size limit");
if (2 * NBX + 1 > windowSizeLimit)
throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nby exceeds window size limit");
if (windowSizeLimit > numberOfCudaThreads)
throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of cuda threads");
if (windowSizeLimit > spot_params.width)
throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of columns");
if (windowSizeLimit > spot_params.height)
throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of height");
const auto nWriters = numberOfCudaThreads - 2 * NBX;
const auto nBlocks = (spot_params.width + nWriters - 1) / nWriters;
const auto window = 2 * NBX + 1;
const auto sharedSize = (2 * sizeof(int64_t) + // sum, sum2
window * sizeof(int32_t) + // val
1 * sizeof(int16_t) // count
) * numberOfCudaThreads;
const dim3 blocks(nBlocks, numberOfWaves);
cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice, stream));
cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), stream));
cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), stream));
analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, stream>>>
(gpu_in, gpu_out_1, gpu_out_0, spot_params);
analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, stream>>>
(gpu_in, gpu_out_0, gpu_out_1, spot_params);
cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
cuda_err(cudaStreamSynchronize(stream));
return ExtractSpots(settings, res_mask);
}