Files
Jungfraujoch/image_analysis/azint/AzIntEngineGPU.cu
T
leonarski_f d760b12a18
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 9m32s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 10m19s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 11m49s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 12m37s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 12m23s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 11m44s
Build Packages / build:rpm (rocky8) (push) Successful in 9m15s
Build Packages / build:rpm (rocky9) (push) Successful in 12m14s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m28s
Build Packages / Generate python client (push) Successful in 21s
Build Packages / XDS test (durin plugin) (push) Successful in 8m58s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m7s
Build Packages / Build documentation (push) Successful in 42s
Build Packages / Create release (push) Has been skipped
Build Packages / XDS test (JFJoch plugin) (push) Successful in 8m24s
Build Packages / DIALS test (push) Successful in 14m1s
Build Packages / XDS test (neggia plugin) (push) Successful in 7m26s
Build Packages / Unit tests (push) Failing after 1h20m14s
v1.0.0-rc.141 (#51)
This is an UNSTABLE release. The release has significant modifications and bug fixes, if things go wrong, it is better to revert to 1.0.0-rc.132.

* jfjoch_broker: Azimuthal integration mapping is generated with parallel computations, significantly reducing setup times
* frontend: Fix selection of FFTW in indexing settings

Reviewed-on: #51
Co-authored-by: Filip Leonarski <filip.leonarski@psi.ch>
Co-committed-by: Filip Leonarski <filip.leonarski@psi.ch>
2026-04-30 13:04:54 +02:00

144 lines
5.0 KiB
Plaintext

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "AzIntEngineGPU.h"
inline void cuda_err(cudaError_t val) {
if (val != cudaSuccess)
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
__global__
void gpu_azim_shared(
const uint16_t *__restrict__ pixel_to_bin,
const float *__restrict__ corrections,
const int32_t *__restrict__ input_buffer,
float *__restrict__ azint_sum,
float *__restrict__ azint_sum2,
uint32_t *__restrict__ azint_count,
size_t num_pixels,
int azint_bins) {
extern __shared__ float shared[];
float *s_sum = shared;
float *s_sum2 = &s_sum[azint_bins];
uint32_t *s_count = (uint32_t *) &s_sum2[azint_bins];
// Initialize shared memory
for (int i = threadIdx.x; i < azint_bins; i += blockDim.x) {
s_sum[i] = 0.0f;
s_sum2[i] = 0.0f;
s_count[i] = 0;
}
__syncthreads();
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < num_pixels;
idx += blockDim.x * gridDim.x) {
uint16_t bin = pixel_to_bin[idx];
int32_t v = input_buffer[idx];
bool valid = (v != INT32_MIN) & (v != INT32_MAX);
if (bin < azint_bins && valid) {
const float val = static_cast<float>(v) * corrections[idx];
const float val2 = val * val;
atomicAdd(&s_sum[bin], val);
atomicAdd(&s_sum2[bin], val2);
atomicAdd(&s_count[bin], 1);
}
}
__syncthreads();
// Merge to global memory
for (unsigned int i = threadIdx.x; i < azint_bins; i += blockDim.x) {
atomicAdd(&azint_sum[i], s_sum[i]);
atomicAdd(&azint_sum2[i], s_sum2[i]);
atomicAdd(&azint_count[i], s_count[i]);
}
}
__global__
void gpu_azim(
const uint16_t *__restrict__ pixel_to_bin,
const float *__restrict__ corrections,
const int32_t *__restrict__ input_buffer,
float *__restrict__ azint_sum,
float *__restrict__ azint_sum2,
uint32_t *__restrict__ azint_count,
size_t num_pixels,
int azint_bins) {
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < num_pixels;
idx += blockDim.x * gridDim.x) {
uint16_t bin = pixel_to_bin[idx];
int32_t v = input_buffer[idx];
bool valid = (v != INT32_MIN) & (v != INT32_MAX);
if (bin < azint_bins && valid) {
const float val = static_cast<float>(v) * corrections[idx];
const float val2 = val * val;
atomicAdd(&azint_sum[bin], val);
atomicAdd(&azint_sum2[bin], val2);
atomicAdd(&azint_count[bin], 1);
}
}
}
AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegrationMapping &integration, std::shared_ptr<CudaStream> stream)
: AzIntEngine(integration),
stream(stream),
gpu_azint_correction(npixel),
gpu_pixel_to_bin(npixel),
gpu_sum(azint_bins),
gpu_sum2(azint_bins),
gpu_count(azint_bins),
cpu_sum_reg(azint_sum),
cpu_sum2_reg(azint_sum2),
cpu_count_reg(azint_count) {
cudaDeviceProp prop{};
cudaGetDeviceProperties(&prop, 0);
threads = 128;
blocks = 4 * prop.multiProcessorCount;
shared_size = prop.sharedMemPerBlock;
shared_needed = azint_bins * (2 * sizeof(float) + sizeof(uint32_t));
cudaMemcpy(gpu_azint_correction, integration.Corrections().data(), sizeof(float) * npixel,
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_pixel_to_bin, integration.GetPixelToBin().data(), sizeof(uint16_t) * npixel,
cudaMemcpyHostToDevice);
}
void AzIntEngineGPU::Run(const ImagePreprocessorBuffer &image, AzimuthalIntegrationProfile &profile) {
if (image.size() != integration.GetPixelToBin().size())
throw std::runtime_error("ImageSpotFinder::AzimIntegration: Mismatch in size");
cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, *stream));
cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, *stream));
cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, *stream));
if (shared_needed < shared_size) {
gpu_azim_shared<<<blocks, threads, shared_needed, *stream>>>(
gpu_pixel_to_bin,gpu_azint_correction,image.getGPUBuffer(), gpu_sum, gpu_sum2,
gpu_count, npixel, azint_bins
);
} else {
gpu_azim<<<blocks, threads, 0, *stream>>>(
gpu_pixel_to_bin,gpu_azint_correction,image.getGPUBuffer(), gpu_sum, gpu_sum2,
gpu_count, npixel, azint_bins
);
}
cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream);
cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream);
cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, *stream);
cuda_err(cudaStreamSynchronize(*stream));
profile.Clear(integration);
profile.Add(azint_sum, azint_count);
}