MXAnalysisWithoutFPGA: Better synchronize preprocessing, azint and spot finding on GPU
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m40s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 14m26s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m21s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m54s
Build Packages / build:rpm (rocky8) (push) Successful in 17m54s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m58s
Build Packages / build:rpm (rocky9) (push) Successful in 18m35s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m44s
Build Packages / Generate python client (push) Successful in 1m28s
Build Packages / Build documentation (push) Successful in 1m59s
Build Packages / Create release (push) Has been skipped
Build Packages / build:rpm (ubuntu2204) (push) Successful in 10m9s
Build Packages / XDS test (durin plugin) (push) Successful in 9m19s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m0s
Build Packages / XDS test (neggia plugin) (push) Successful in 8m47s
Build Packages / DIALS test (push) Successful in 12m36s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 9m57s
Build Packages / Unit tests (push) Successful in 1h1m21s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m40s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 14m26s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m21s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m54s
Build Packages / build:rpm (rocky8) (push) Successful in 17m54s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m58s
Build Packages / build:rpm (rocky9) (push) Successful in 18m35s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m44s
Build Packages / Generate python client (push) Successful in 1m28s
Build Packages / Build documentation (push) Successful in 1m59s
Build Packages / Create release (push) Has been skipped
Build Packages / build:rpm (ubuntu2204) (push) Successful in 10m9s
Build Packages / XDS test (durin plugin) (push) Successful in 9m19s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m0s
Build Packages / XDS test (neggia plugin) (push) Successful in 8m47s
Build Packages / DIALS test (push) Successful in 12m36s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 9m57s
Build Packages / Unit tests (push) Successful in 1h1m21s
This commit is contained in:
@@ -50,19 +50,16 @@ __global__ void preprocess_kernel(
|
||||
|
||||
// Output
|
||||
output[i] =
|
||||
is_masked ? INT32_MIN :
|
||||
is_sat ? INT32_MAX :
|
||||
is_err ? INT32_MIN :
|
||||
(int32_t)v;
|
||||
is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;
|
||||
|
||||
// Counters
|
||||
local_masked += is_masked;
|
||||
local_saturated+= (!is_masked && is_sat);
|
||||
local_error += (!is_masked && !is_sat && is_err);
|
||||
local_masked += is_masked;
|
||||
local_saturated += (!is_masked && is_sat);
|
||||
local_error += (!is_masked && !is_sat && is_err);
|
||||
|
||||
// Min/max only for valid
|
||||
if (valid) {
|
||||
int64_t val = (int64_t)v;
|
||||
int64_t val = (int64_t) v;
|
||||
if (val > local_max) local_max = val;
|
||||
if (val < local_min) local_min = val;
|
||||
}
|
||||
@@ -74,8 +71,8 @@ __global__ void preprocess_kernel(
|
||||
atomicAdd(&s_error, local_error);
|
||||
|
||||
if (local_min <= local_max) {
|
||||
atomicMax((long long*)&s_max, (long long)local_max);
|
||||
atomicMin((long long*)&s_min, (long long)local_min);
|
||||
atomicMax((long long *) &s_max, (long long) local_max);
|
||||
atomicMin((long long *) &s_min, (long long) local_min);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -86,22 +83,21 @@ __global__ void preprocess_kernel(
|
||||
atomicAdd(&stats->saturated_pixel_count, s_saturated);
|
||||
atomicAdd(&stats->error_pixel_count, s_error);
|
||||
|
||||
atomicMax((long long*)&stats->max_value, (long long)s_max);
|
||||
atomicMin((long long*)&stats->min_value, (long long)s_min);
|
||||
atomicMax((long long *) &stats->max_value, (long long) s_max);
|
||||
atomicMin((long long *) &stats->min_value, (long long) s_min);
|
||||
}
|
||||
}
|
||||
|
||||
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
|
||||
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
|
||||
std::shared_ptr<CudaStream> stream)
|
||||
: ImagePreprocessor(experiment),
|
||||
stream(stream),
|
||||
gpu_mask(npixels),
|
||||
gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
|
||||
gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
|
||||
gpu_image(npixels),
|
||||
gpu_stats(1),
|
||||
cpu_stats(1),
|
||||
cpu_stats_reg(cpu_stats) {
|
||||
|
||||
stream = std::make_shared<CudaStream>();
|
||||
|
||||
// Setup mask
|
||||
std::vector<uint8_t> mask_vec(npixels);
|
||||
for (int i = 0; i < npixels; i++)
|
||||
@@ -142,7 +138,6 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
|
||||
const uint8_t *input,
|
||||
T err_value,
|
||||
T sat_value) {
|
||||
|
||||
if (sat_value > saturation_limit)
|
||||
sat_value = static_cast<T>(saturation_limit);
|
||||
|
||||
@@ -150,7 +145,7 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
|
||||
|
||||
cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
|
||||
cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
|
||||
preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
|
||||
preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
|
||||
reinterpret_cast<const T *>(gpu_decompressed_image.get()),
|
||||
gpu_mask,
|
||||
gpu_image,
|
||||
|
||||
Reference in New Issue
Block a user