diff --git a/image_analysis/MXAnalysisWithoutFPGA.cpp b/image_analysis/MXAnalysisWithoutFPGA.cpp index 5a319f82..f35d8d84 100644 --- a/image_analysis/MXAnalysisWithoutFPGA.cpp +++ b/image_analysis/MXAnalysisWithoutFPGA.cpp @@ -9,13 +9,14 @@ #include "spot_finding/SpotUtils.h" #include "spot_finding/ImageSpotFinderFactory.h" #include "bragg_prediction/BraggPredictionFactory.h" -#include "image_preprocessing/ImagePreprocessor.h" +#include "image_preprocessing/ImagePreprocessorCPU.h" #include "azint/AzIntEngineCPU.h" #include "spot_finding/ImageSpotFinderCPU.h" #ifdef JFJOCH_USE_CUDA #include "azint/AzIntEngineGPU.h" #include "spot_finding/ImageSpotFinderGPU.h" +#include "image_preprocessing/ImagePreprocessorGPU.h" #include "../common/CUDAWrapper.h" #endif @@ -34,15 +35,17 @@ MXAnalysisWithoutFPGA::MXAnalysisWithoutFPGA(const DiffractionExperiment &in_exp mask_resolution(experiment.GetPixelsNum(), false), mask_high_res(-1), mask_low_res(-1) { - preprocessor = std::make_unique(in_experiment, in_mask); + #ifdef JFJOCH_USE_CUDA if (get_gpu_count() == 0) { #endif spotFinder = std::make_unique(experiment.GetXPixelsNum(), experiment.GetYPixelsNum()); azint = std::make_unique(integration); + preprocessor = std::make_unique(in_experiment, in_mask); #ifdef JFJOCH_USE_CUDA } else { + preprocessor = std::make_unique(in_experiment, in_mask); spotFinder = std::make_unique(experiment.GetXPixelsNum(), experiment.GetYPixelsNum()); azint = std::make_unique(integration); } diff --git a/image_analysis/image_preprocessing/CMakeLists.txt b/image_analysis/image_preprocessing/CMakeLists.txt index 55a3e7cb..75a5515a 100644 --- a/image_analysis/image_preprocessing/CMakeLists.txt +++ b/image_analysis/image_preprocessing/CMakeLists.txt @@ -1,3 +1,11 @@ -ADD_LIBRARY(JFJochImagePreprocessing STATIC ImagePreprocessor.cpp ImagePreprocessor.h) +ADD_LIBRARY(JFJochImagePreprocessing STATIC ImagePreprocessor.cpp ImagePreprocessor.h + ImagePreprocessorCPU.cpp + ImagePreprocessorCPU.h) -TARGET_LINK_LIBRARIES(JFJochImagePreprocessing JFJochCommon) \ No newline at end of file +TARGET_LINK_LIBRARIES(JFJochImagePreprocessing JFJochCommon) + +IF (JFJOCH_CUDA_AVAILABLE) + TARGET_SOURCES(JFJochImagePreprocessing PRIVATE + ../indexing/CUDAMemHelpers.h + ImagePreprocessorGPU.cu ImagePreprocessorGPU.h) +ENDIF() \ No newline at end of file diff --git a/image_analysis/image_preprocessing/ImagePreprocessor.cpp b/image_analysis/image_preprocessing/ImagePreprocessor.cpp index 43427866..577dda15 100644 --- a/image_analysis/image_preprocessing/ImagePreprocessor.cpp +++ b/image_analysis/image_preprocessing/ImagePreprocessor.cpp @@ -3,67 +3,7 @@ #include "ImagePreprocessor.h" -ImagePreprocessor::ImagePreprocessor(const DiffractionExperiment &experiment, - const PixelMask &mask) +ImagePreprocessor::ImagePreprocessor(const DiffractionExperiment &experiment) : npixels(experiment.GetPixelsNum()), experiment(experiment), - mask_1bit(npixels, false), - saturation_limit(experiment.GetSaturationLimit()) { - - for (int i = 0; i < npixels; i++) - mask_1bit[i] = (mask.GetMask().at(i) != 0); -} - -ImageStatistics ImagePreprocessor::Analyze(std::vector &processed_image, const uint8_t *image_ptr, CompressedImageMode image_mode) { - switch (image_mode) { - case CompressedImageMode::Int8: - return Analyze(processed_image, image_ptr, INT8_MIN, INT8_MAX); - case CompressedImageMode::Int16: - return Analyze(processed_image, image_ptr, INT16_MIN, INT16_MAX); - case CompressedImageMode::Int32: - return Analyze(processed_image, image_ptr, INT32_MIN, INT32_MAX); - case CompressedImageMode::Uint8: - return Analyze(processed_image, image_ptr, UINT8_MAX, UINT8_MAX); - case CompressedImageMode::Uint16: - return Analyze(processed_image, image_ptr, UINT16_MAX, UINT16_MAX); - case CompressedImageMode::Uint32: - return Analyze(processed_image, image_ptr, UINT32_MAX, UINT32_MAX); - default: - throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported"); - } -} - -template -ImageStatistics ImagePreprocessor::Analyze(std::vector &processed_image, const uint8_t *input, T err_pixel_val, T sat_pixel_val) { - - if (processed_image.size() != npixels) - throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Processed image size mismatch"); - - auto image = reinterpret_cast(input); - - ImageStatistics ret{}; - - if (sat_pixel_val > saturation_limit) - sat_pixel_val = static_cast(saturation_limit); - - for (int i = 0; i < npixels; i++) { - if (mask_1bit[i] != 0) { - processed_image[i] = INT32_MIN; - ++ret.masked_pixel_count; - } else if (image[i] >= sat_pixel_val) { - processed_image[i] = INT32_MAX; - ++ret.saturated_pixel_count; - } else if (std::is_signed::value && (image[i] == err_pixel_val)) { - // Error pixels are possible only for signed types - processed_image[i] = INT32_MIN; - ++ret.error_pixel_count; - } else { - processed_image[i] = static_cast(image[i]); - if (image[i] > ret.max_value) - ret.max_value = image[i]; - if (image[i] < ret.min_value) - ret.min_value = image[i]; - } - } - return ret; -} + saturation_limit(experiment.GetSaturationLimit()) {} diff --git a/image_analysis/image_preprocessing/ImagePreprocessor.h b/image_analysis/image_preprocessing/ImagePreprocessor.h index 3057ceff..7c2bbeb5 100644 --- a/image_analysis/image_preprocessing/ImagePreprocessor.h +++ b/image_analysis/image_preprocessing/ImagePreprocessor.h @@ -9,22 +9,21 @@ #include "../common/DiffractionExperiment.h" #include "../common/PixelMask.h" -struct ImageStatistics { - size_t error_pixel_count = 0; - size_t saturated_pixel_count = 0; - size_t masked_pixel_count = 0; - int64_t max_value = INT64_MIN; - int64_t min_value = INT64_MAX; +struct alignas(8) ImageStatistics { + unsigned long long error_pixel_count = 0; + unsigned long long saturated_pixel_count = 0; + unsigned long long masked_pixel_count = 0; + long long max_value = INT64_MIN; + long long min_value = INT64_MAX; }; class ImagePreprocessor { protected: const size_t npixels; const DiffractionExperiment &experiment; - std::vector mask_1bit; const int64_t saturation_limit; - template ImageStatistics Analyze(std::vector &processed_image, const uint8_t *input, T err_value, T sat_value); public: - ImagePreprocessor(const DiffractionExperiment &experiment, const PixelMask &mask); - ImageStatistics Analyze(std::vector &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode); + ImagePreprocessor(const DiffractionExperiment &experiment); + virtual ~ImagePreprocessor() = default; + virtual ImageStatistics Analyze(std::vector &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) = 0; }; diff --git a/image_analysis/image_preprocessing/ImagePreprocessorCPU.cpp b/image_analysis/image_preprocessing/ImagePreprocessorCPU.cpp new file mode 100644 index 00000000..28ad9fe8 --- /dev/null +++ b/image_analysis/image_preprocessing/ImagePreprocessorCPU.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-only + +#include "ImagePreprocessorCPU.h" + +ImagePreprocessorCPU::ImagePreprocessorCPU(const DiffractionExperiment &experiment, const PixelMask &mask) + : ImagePreprocessor(experiment), + mask_1bit(npixels, false) { + for (int i = 0; i < npixels; i++) + mask_1bit[i] = (mask.GetMask().at(i) != 0); +} + +ImageStatistics ImagePreprocessorCPU::Analyze(std::vector &processed_image, const uint8_t *image_ptr, CompressedImageMode image_mode) { + switch (image_mode) { + case CompressedImageMode::Int8: + return Analyze(processed_image, image_ptr, INT8_MIN, INT8_MAX); + case CompressedImageMode::Int16: + return Analyze(processed_image, image_ptr, INT16_MIN, INT16_MAX); + case CompressedImageMode::Int32: + return Analyze(processed_image, image_ptr, INT32_MIN, INT32_MAX); + case CompressedImageMode::Uint8: + return Analyze(processed_image, image_ptr, UINT8_MAX, UINT8_MAX); + case CompressedImageMode::Uint16: + return Analyze(processed_image, image_ptr, UINT16_MAX, UINT16_MAX); + case CompressedImageMode::Uint32: + return Analyze(processed_image, image_ptr, UINT32_MAX, UINT32_MAX); + default: + throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported"); + } +} + +template +ImageStatistics ImagePreprocessorCPU::Analyze(std::vector &processed_image, const uint8_t *input, T err_pixel_val, T sat_pixel_val) { + + if (processed_image.size() != npixels) + throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Processed image size mismatch"); + + auto image = reinterpret_cast(input); + + ImageStatistics ret{}; + + if (sat_pixel_val > saturation_limit) + sat_pixel_val = static_cast(saturation_limit); + + for (int i = 0; i < npixels; i++) { + if (mask_1bit[i] != 0) { + processed_image[i] = INT32_MIN; + ++ret.masked_pixel_count; + } else if (image[i] >= sat_pixel_val) { + processed_image[i] = INT32_MAX; + ++ret.saturated_pixel_count; + } else if (std::is_signed::value && (image[i] == err_pixel_val)) { + // Error pixels are possible only for signed types + processed_image[i] = INT32_MIN; + ++ret.error_pixel_count; + } else { + processed_image[i] = static_cast(image[i]); + if (image[i] > ret.max_value) + ret.max_value = image[i]; + if (image[i] < ret.min_value) + ret.min_value = image[i]; + } + } + return ret; +} diff --git a/image_analysis/image_preprocessing/ImagePreprocessorCPU.h b/image_analysis/image_preprocessing/ImagePreprocessorCPU.h new file mode 100644 index 00000000..9f68b9b0 --- /dev/null +++ b/image_analysis/image_preprocessing/ImagePreprocessorCPU.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-only + +#pragma once + +#include "ImagePreprocessor.h" + +class ImagePreprocessorCPU : public ImagePreprocessor { + + std::vector mask_1bit; + + template ImageStatistics Analyze(std::vector &processed_image, const uint8_t *input, T err_value, T sat_value); +public: + ImagePreprocessorCPU(const DiffractionExperiment &experiment, const PixelMask &mask); + ImageStatistics Analyze(std::vector &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) override; +}; diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu new file mode 100644 index 00000000..280d13dd --- /dev/null +++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu @@ -0,0 +1,175 @@ +// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-only + +#include "ImagePreprocessorGPU.h" + +template +__global__ void preprocess_kernel( + const T *__restrict__ input, + const uint8_t *__restrict__ mask, + int32_t *__restrict__ output, + ImageStatistics *__restrict__ stats, + T saturation_limit, + T err_value, + int npixels) { + // Shared block accumulators + __shared__ unsigned long long s_masked; + __shared__ unsigned long long s_saturated; + __shared__ unsigned long long s_error; + __shared__ long long s_max; + __shared__ long long s_min; + + if (threadIdx.x == 0) { + s_masked = 0; + s_saturated = 0; + s_error = 0; + s_max = INT64_MIN; + s_min = INT64_MAX; + } + __syncthreads(); + + // Thread-local accumulators + unsigned long long local_masked = 0; + unsigned long long local_saturated = 0; + unsigned long long local_error = 0; + long long local_max = INT64_MIN; + long long local_min = INT64_MAX; + + constexpr bool is_signed = std::is_signed::value; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < npixels; + i += blockDim.x * gridDim.x) { + T v = input[i]; + + bool is_masked = mask[i]; + bool is_sat = v >= saturation_limit; + bool is_err = is_signed && (v == err_value); + + bool valid = !(is_masked || is_sat || is_err); + + // Output + output[i] = + is_masked ? INT32_MIN : + is_sat ? INT32_MAX : + is_err ? INT32_MIN : + (int32_t)v; + + // Counters + local_masked += is_masked; + local_saturated+= (!is_masked && is_sat); + local_error += (!is_masked && !is_sat && is_err); + + // Min/max only for valid + if (valid) { + int64_t val = (int64_t)v; + if (val > local_max) local_max = val; + if (val < local_min) local_min = val; + } + } + + // Reduce to shared memory + atomicAdd(&s_masked, local_masked); + atomicAdd(&s_saturated, local_saturated); + atomicAdd(&s_error, local_error); + + if (local_min <= local_max) { + atomicMax((long long*)&s_max, (long long)local_max); + atomicMin((long long*)&s_min, (long long)local_min); + } + + __syncthreads(); + + // One thread writes block result + if (threadIdx.x == 0) { + atomicAdd(&stats->masked_pixel_count, s_masked); + atomicAdd(&stats->saturated_pixel_count, s_saturated); + atomicAdd(&stats->error_pixel_count, s_error); + + atomicMax((long long*)&stats->max_value, (long long)s_max); + atomicMin((long long*)&stats->min_value, (long long)s_min); + } +} + +ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask) + : ImagePreprocessor(experiment), + gpu_mask(npixels), + gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal + gpu_image(npixels), + gpu_stats(1), + cpu_stats(1), + cpu_stats_reg(cpu_stats) { + + stream = std::make_shared(); + + // Setup mask + std::vector mask_vec(npixels); + for (int i = 0; i < npixels; i++) + mask_vec[i] = (mask.GetMask().at(i) != 0); + + cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice); + + // Setup GPU settings + cudaDeviceProp prop{}; + cudaGetDeviceProperties(&prop, 0); + + threads = 128; + blocks = 4 * prop.multiProcessorCount; +} + +ImageStatistics ImagePreprocessorGPU::Analyze(std::vector &processed_image, const uint8_t *image_ptr, + CompressedImageMode image_mode) { + switch (image_mode) { + case CompressedImageMode::Int8: + return Analyze(processed_image, image_ptr, INT8_MIN, INT8_MAX); + case CompressedImageMode::Int16: + return Analyze(processed_image, image_ptr, INT16_MIN, INT16_MAX); + case CompressedImageMode::Int32: + return Analyze(processed_image, image_ptr, INT32_MIN, INT32_MAX); + case CompressedImageMode::Uint8: + return Analyze(processed_image, image_ptr, UINT8_MAX, UINT8_MAX); + case CompressedImageMode::Uint16: + return Analyze(processed_image, image_ptr, UINT16_MAX, UINT16_MAX); + case CompressedImageMode::Uint32: + return Analyze(processed_image, image_ptr, UINT32_MAX, UINT32_MAX); + default: + throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported"); + } +} + +template +ImageStatistics ImagePreprocessorGPU::Analyze(std::vector &processed_image, + const uint8_t *input, + T err_value, + T sat_value) { + + if (sat_value > saturation_limit) + sat_value = static_cast(saturation_limit); + + cudaMemcpy(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice); + + cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX}; + cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream); + preprocess_kernel <<< blocks, threads, 0, *stream >>> ( + reinterpret_cast(gpu_decompressed_image.get()), + gpu_mask, + gpu_image, + gpu_stats, + sat_value, + err_value, + npixels); + cudaMemcpyAsync(processed_image.data(), gpu_image, npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream); + + cudaStreamSynchronize(*stream); + + return cpu_stats[0]; +} + +const int32_t *ImagePreprocessorGPU::GetImageDevicePtr() const { + return gpu_image; +} + +std::shared_ptr ImagePreprocessorGPU::GetStream() const { + return stream; +} diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.h b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h new file mode 100644 index 00000000..b877a1e5 --- /dev/null +++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-only + +#pragma once + +#include "ImagePreprocessor.h" +#include "../indexing/CUDAMemHelpers.h" + +class ImagePreprocessorGPU : public ImagePreprocessor { + std::shared_ptr stream; + int threads; + int blocks; + CudaDevicePtr gpu_mask; + CudaDevicePtr gpu_decompressed_image; + CudaDevicePtr gpu_image; + CudaDevicePtr gpu_stats; + + std::vector cpu_stats; + CudaRegisteredVector cpu_stats_reg; + + std::vector cpu_image; + + template ImageStatistics Analyze(std::vector &processed_image, const uint8_t *input, T err_value, T sat_value); +public: + ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask); + ImageStatistics Analyze(std::vector &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) override; + + const int32_t* GetImageDevicePtr() const; + std::shared_ptr GetStream() const; +}; +