d760b12a18
This is an UNSTABLE release. The release contains significant modifications and bug fixes; if things go wrong, it is better to revert to 1.0.0-rc.132.

* jfjoch_broker: Azimuthal integration mapping is generated with parallel computations, significantly reducing setup times
* frontend: Fix selection of FFTW in indexing settings

Reviewed-on: #51
Co-authored-by: Filip Leonarski <filip.leonarski@psi.ch>
Co-committed-by: Filip Leonarski <filip.leonarski@psi.ch>
// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "AzIntEngineGPU.h"

// Throw a JFJochException when a CUDA runtime call does not return cudaSuccess
inline void cuda_err(cudaError_t val) {
    if (val != cudaSuccess)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
// Azimuthal integration kernel using shared-memory privatization: each block
// accumulates per-bin sums in shared memory and merges them into the global
// accumulators once at the end, reducing the number of global atomics.
__global__
void gpu_azim_shared(
        const uint16_t *__restrict__ pixel_to_bin,
        const float *__restrict__ corrections,
        const int32_t *__restrict__ input_buffer,
        float *__restrict__ azint_sum,
        float *__restrict__ azint_sum2,
        uint32_t *__restrict__ azint_count,
        size_t num_pixels,
        int azint_bins) {
    extern __shared__ float shared[];

    float *s_sum = shared;
    float *s_sum2 = &s_sum[azint_bins];
    uint32_t *s_count = (uint32_t *) &s_sum2[azint_bins];

    // Initialize shared memory
    for (int i = threadIdx.x; i < azint_bins; i += blockDim.x) {
        s_sum[i] = 0.0f;
        s_sum2[i] = 0.0f;
        s_count[i] = 0;
    }

    __syncthreads();

    // Grid-stride loop over all pixels
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < num_pixels;
         idx += blockDim.x * gridDim.x) {
        uint16_t bin = pixel_to_bin[idx];

        // Pixels flagged with INT32_MIN/INT32_MAX are excluded from the sums
        int32_t v = input_buffer[idx];
        bool valid = (v != INT32_MIN) & (v != INT32_MAX);

        if (bin < azint_bins && valid) {
            const float val = static_cast<float>(v) * corrections[idx];
            const float val2 = val * val;
            atomicAdd(&s_sum[bin], val);
            atomicAdd(&s_sum2[bin], val2);
            atomicAdd(&s_count[bin], 1);
        }
    }

    __syncthreads();

    // Merge to global memory
    for (unsigned int i = threadIdx.x; i < azint_bins; i += blockDim.x) {
        atomicAdd(&azint_sum[i], s_sum[i]);
        atomicAdd(&azint_sum2[i], s_sum2[i]);
        atomicAdd(&azint_count[i], s_count[i]);
    }
}
// Fallback kernel that accumulates directly in global memory, used when the
// per-bin accumulators do not fit into the available shared memory per block
__global__
void gpu_azim(
        const uint16_t *__restrict__ pixel_to_bin,
        const float *__restrict__ corrections,
        const int32_t *__restrict__ input_buffer,
        float *__restrict__ azint_sum,
        float *__restrict__ azint_sum2,
        uint32_t *__restrict__ azint_count,
        size_t num_pixels,
        int azint_bins) {
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < num_pixels;
         idx += blockDim.x * gridDim.x) {
        uint16_t bin = pixel_to_bin[idx];

        int32_t v = input_buffer[idx];
        bool valid = (v != INT32_MIN) & (v != INT32_MAX);

        if (bin < azint_bins && valid) {
            const float val = static_cast<float>(v) * corrections[idx];
            const float val2 = val * val;
            atomicAdd(&azint_sum[bin], val);
            atomicAdd(&azint_sum2[bin], val2);
            atomicAdd(&azint_count[bin], 1);
        }
    }
}
AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegrationMapping &integration, std::shared_ptr<CudaStream> stream)
        : AzIntEngine(integration),
          stream(stream),
          gpu_azint_correction(npixel),
          gpu_pixel_to_bin(npixel),
          gpu_sum(azint_bins),
          gpu_sum2(azint_bins),
          gpu_count(azint_bins),
          cpu_sum_reg(azint_sum),
          cpu_sum2_reg(azint_sum2),
          cpu_count_reg(azint_count) {

    // Size the launch configuration and shared-memory budget from the device properties
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);

    threads = 128;
    blocks = 4 * prop.multiProcessorCount;
    shared_size = prop.sharedMemPerBlock;
    shared_needed = azint_bins * (2 * sizeof(float) + sizeof(uint32_t));

    // Upload per-pixel correction factors and the pixel-to-bin mapping to the GPU
    cudaMemcpy(gpu_azint_correction, integration.Corrections().data(), sizeof(float) * npixel,
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_pixel_to_bin, integration.GetPixelToBin().data(), sizeof(uint16_t) * npixel,
               cudaMemcpyHostToDevice);
}
void AzIntEngineGPU::Run(const ImagePreprocessorBuffer &image, AzimuthalIntegrationProfile &profile) {
    if (image.size() != integration.GetPixelToBin().size())
        throw std::runtime_error("ImageSpotFinder::AzimIntegration: Mismatch in size");

    // Reset the per-bin accumulators on the device
    cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, *stream));
    cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, *stream));
    cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, *stream));

    // Prefer the shared-memory kernel when the per-bin accumulators fit in shared memory
    if (shared_needed < shared_size) {
        gpu_azim_shared<<<blocks, threads, shared_needed, *stream>>>(
                gpu_pixel_to_bin, gpu_azint_correction, image.getGPUBuffer(), gpu_sum, gpu_sum2,
                gpu_count, npixel, azint_bins
        );
    } else {
        gpu_azim<<<blocks, threads, 0, *stream>>>(
                gpu_pixel_to_bin, gpu_azint_correction, image.getGPUBuffer(), gpu_sum, gpu_sum2,
                gpu_count, npixel, azint_bins
        );
    }

    // Copy the results back to the host buffers and wait for the stream to finish
    cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, *stream);
    cuda_err(cudaStreamSynchronize(*stream));

    profile.Clear(integration);
    profile.Add(azint_sum, azint_count);
}
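
Below is a minimal, self-contained sketch (not part of the engine above) of the same shared-memory budgeting decision that the constructor and Run() make: each bin needs two floats and one uint32_t of accumulator space, and the shared-memory kernel is only chosen when that total fits within sharedMemPerBlock. The bin count of 2000 is an arbitrary assumption used purely for illustration.

// Standalone illustration; compile with nvcc. Mirrors the kernel-selection
// logic of AzIntEngineGPU without touching any of its types.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);

    const int azint_bins = 2000;  // example bin count (assumption, not from the engine)
    const size_t shared_needed = azint_bins * (2 * sizeof(float) + sizeof(uint32_t));

    if (shared_needed < prop.sharedMemPerBlock)
        std::printf("Shared-memory kernel: %zu of %zu bytes per block\n",
                    shared_needed, (size_t) prop.sharedMemPerBlock);
    else
        std::printf("Falling back to the global-atomics kernel\n");
    return 0;
}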