diff --git a/image_analysis/MXAnalysisWithoutFPGA.cpp b/image_analysis/MXAnalysisWithoutFPGA.cpp index f35d8d84..ceb02df2 100644 --- a/image_analysis/MXAnalysisWithoutFPGA.cpp +++ b/image_analysis/MXAnalysisWithoutFPGA.cpp @@ -7,7 +7,6 @@ #include "../compression/JFJochDecompress.h" #include "spot_finding/SpotUtils.h" -#include "spot_finding/ImageSpotFinderFactory.h" #include "bragg_prediction/BraggPredictionFactory.h" #include "image_preprocessing/ImagePreprocessorCPU.h" @@ -45,9 +44,10 @@ MXAnalysisWithoutFPGA::MXAnalysisWithoutFPGA(const DiffractionExperiment &in_exp preprocessor = std::make_unique(in_experiment, in_mask); #ifdef JFJOCH_USE_CUDA } else { - preprocessor = std::make_unique(in_experiment, in_mask); - spotFinder = std::make_unique(experiment.GetXPixelsNum(), experiment.GetYPixelsNum()); - azint = std::make_unique(integration); + auto stream = std::make_shared(); + preprocessor = std::make_unique(in_experiment, in_mask, stream); + spotFinder = std::make_unique(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), stream); + azint = std::make_unique(integration, stream); } #endif } diff --git a/image_analysis/azint/AzIntEngineGPU.cu b/image_analysis/azint/AzIntEngineGPU.cu index 39aca31e..0fda4812 100644 --- a/image_analysis/azint/AzIntEngineGPU.cu +++ b/image_analysis/azint/AzIntEngineGPU.cu @@ -88,8 +88,9 @@ void gpu_azim( } } -AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegration &integration) +AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegration &integration, std::shared_ptr stream) : AzIntEngine(integration), + stream(stream), gpu_azint_correction(npixel), gpu_pixel_to_bin(npixel), gpu_sum(azint_bins), @@ -117,27 +118,27 @@ AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegration &integration) void AzIntEngineGPU::Run(const std::vector &image, AzimuthalIntegrationProfile &profile) { if (image.size() != integration.GetPixelToBin().size()) throw std::runtime_error("ImageSpotFinder::AzimIntegration: Mismatch in size"); - cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, stream)); - cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, stream)); - cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, stream)); - cudaMemcpyAsync(gpu_image, image.data(), sizeof(int32_t) * npixel, cudaMemcpyHostToDevice, stream); + cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, *stream)); + cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, *stream)); + cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, *stream)); + cudaMemcpyAsync(gpu_image, image.data(), sizeof(int32_t) * npixel, cudaMemcpyHostToDevice, *stream); if (shared_needed < shared_size) { - gpu_azim_shared<<>>( + gpu_azim_shared<<>>( gpu_pixel_to_bin,gpu_azint_correction,gpu_image, gpu_sum, gpu_sum2, gpu_count, npixel, azint_bins ); } else { - gpu_azim<<>>( + gpu_azim<<>>( gpu_pixel_to_bin,gpu_azint_correction,gpu_image, gpu_sum, gpu_sum2, gpu_count, npixel, azint_bins ); } - cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, stream); - cuda_err(cudaStreamSynchronize(stream)); + cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream); + cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, *stream); + cuda_err(cudaStreamSynchronize(*stream)); profile.Clear(integration); profile.Add(azint_sum, azint_count); diff --git a/image_analysis/azint/AzIntEngineGPU.h b/image_analysis/azint/AzIntEngineGPU.h index 3ba5dad8..2ecf7d75 100644 --- a/image_analysis/azint/AzIntEngineGPU.h +++ b/image_analysis/azint/AzIntEngineGPU.h @@ -7,7 +7,7 @@ #include "../indexing/CUDAMemHelpers.h" class AzIntEngineGPU : public AzIntEngine { - CudaStream stream; + std::shared_ptr stream; int threads; int blocks; size_t shared_needed; @@ -25,6 +25,6 @@ class AzIntEngineGPU : public AzIntEngine { CudaDevicePtr gpu_image; public: - AzIntEngineGPU(const AzimuthalIntegration& integration); + AzIntEngineGPU(const AzimuthalIntegration& integration, std::shared_ptr stream); void Run(const std::vector &image, AzimuthalIntegrationProfile &profile) override; }; diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu index 280d13dd..d253270d 100644 --- a/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu +++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu @@ -50,19 +50,16 @@ __global__ void preprocess_kernel( // Output output[i] = - is_masked ? INT32_MIN : - is_sat ? INT32_MAX : - is_err ? INT32_MIN : - (int32_t)v; + is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v; // Counters - local_masked += is_masked; - local_saturated+= (!is_masked && is_sat); - local_error += (!is_masked && !is_sat && is_err); + local_masked += is_masked; + local_saturated += (!is_masked && is_sat); + local_error += (!is_masked && !is_sat && is_err); // Min/max only for valid if (valid) { - int64_t val = (int64_t)v; + int64_t val = (int64_t) v; if (val > local_max) local_max = val; if (val < local_min) local_min = val; } @@ -74,8 +71,8 @@ __global__ void preprocess_kernel( atomicAdd(&s_error, local_error); if (local_min <= local_max) { - atomicMax((long long*)&s_max, (long long)local_max); - atomicMin((long long*)&s_min, (long long)local_min); + atomicMax((long long *) &s_max, (long long) local_max); + atomicMin((long long *) &s_min, (long long) local_min); } __syncthreads(); @@ -86,22 +83,21 @@ __global__ void preprocess_kernel( atomicAdd(&stats->saturated_pixel_count, s_saturated); atomicAdd(&stats->error_pixel_count, s_error); - atomicMax((long long*)&stats->max_value, (long long)s_max); - atomicMin((long long*)&stats->min_value, (long long)s_min); + atomicMax((long long *) &stats->max_value, (long long) s_max); + atomicMin((long long *) &stats->min_value, (long long) s_min); } } -ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask) +ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask, + std::shared_ptr stream) : ImagePreprocessor(experiment), + stream(stream), gpu_mask(npixels), - gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal + gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal gpu_image(npixels), gpu_stats(1), cpu_stats(1), cpu_stats_reg(cpu_stats) { - - stream = std::make_shared(); - // Setup mask std::vector mask_vec(npixels); for (int i = 0; i < npixels; i++) @@ -142,7 +138,6 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector &processed_im const uint8_t *input, T err_value, T sat_value) { - if (sat_value > saturation_limit) sat_value = static_cast(saturation_limit); @@ -150,7 +145,7 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector &processed_im cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX}; cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream); - preprocess_kernel <<< blocks, threads, 0, *stream >>> ( + preprocess_kernel <<< blocks, threads, 0, *stream >>>( reinterpret_cast(gpu_decompressed_image.get()), gpu_mask, gpu_image, diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.h b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h index b877a1e5..ed8f0284 100644 --- a/image_analysis/image_preprocessing/ImagePreprocessorGPU.h +++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h @@ -22,7 +22,7 @@ class ImagePreprocessorGPU : public ImagePreprocessor { template ImageStatistics Analyze(std::vector &processed_image, const uint8_t *input, T err_value, T sat_value); public: - ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask); + ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask, std::shared_ptr stream); ImageStatistics Analyze(std::vector &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) override; const int32_t* GetImageDevicePtr() const; diff --git a/image_analysis/spot_finding/CMakeLists.txt b/image_analysis/spot_finding/CMakeLists.txt index abec1704..44bd6ca1 100644 --- a/image_analysis/spot_finding/CMakeLists.txt +++ b/image_analysis/spot_finding/CMakeLists.txt @@ -9,8 +9,6 @@ ADD_LIBRARY(JFJochSpotFinding STATIC DetModuleSpotFinder_cpu.h ImageSpotFinder.cpp ImageSpotFinder.h - ImageSpotFinderFactory.cpp - ImageSpotFinderFactory.h ) TARGET_LINK_LIBRARIES(JFJochSpotFinding JFJochCommon) diff --git a/image_analysis/spot_finding/ImageSpotFinderFactory.cpp b/image_analysis/spot_finding/ImageSpotFinderFactory.cpp deleted file mode 100644 index da7dce0e..00000000 --- a/image_analysis/spot_finding/ImageSpotFinderFactory.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute -// SPDX-License-Identifier: GPL-3.0-only - -#include "ImageSpotFinderFactory.h" -#include "ImageSpotFinderCPU.h" - -#ifdef JFJOCH_USE_CUDA -#include "../../common/CUDAWrapper.h" -#include "ImageSpotFinderGPU.h" -#endif - -std::unique_ptr CreateImageSpotFinder(size_t width, size_t height) { -#ifdef JFJOCH_USE_CUDA - if (get_gpu_count() > 0) - return std::make_unique(width, height); -#endif - return std::make_unique(width, height); -} diff --git a/image_analysis/spot_finding/ImageSpotFinderFactory.h b/image_analysis/spot_finding/ImageSpotFinderFactory.h deleted file mode 100644 index ef74eb5e..00000000 --- a/image_analysis/spot_finding/ImageSpotFinderFactory.h +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute -// SPDX-License-Identifier: GPL-3.0-only - -#ifndef JFJOCH_IMAGESPOTFINDERFACTORY_H -#define JFJOCH_IMAGESPOTFINDERFACTORY_H - -#include "ImageSpotFinder.h" - -std::unique_ptr CreateImageSpotFinder(size_t width, size_t height); - -#endif //JFJOCH_IMAGESPOTFINDERFACTORY_H \ No newline at end of file diff --git a/image_analysis/spot_finding/ImageSpotFinderGPU.cu b/image_analysis/spot_finding/ImageSpotFinderGPU.cu index ebc3fe19..199cc3ce 100644 --- a/image_analysis/spot_finding/ImageSpotFinderGPU.cu +++ b/image_analysis/spot_finding/ImageSpotFinderGPU.cu @@ -228,10 +228,11 @@ __global__ void analyze_pixel(const int32_t *in, uint32_t *prev_out, uint32_t *o } while (back < rmax); } -ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height) : - ImageSpotFinder(in_width, in_height), - input_buffer_reg(input_buffer), - output_buffer_reg(output_buffer) { +ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height, + std::shared_ptr stream) : stream(stream), + ImageSpotFinder(in_width, in_height), + input_buffer_reg(input_buffer), + output_buffer_reg(output_buffer) { gpu_in = CudaDevicePtr(width * height); gpu_out_0 = CudaDevicePtr(OutputSize()); gpu_out_1 = CudaDevicePtr(OutputSize()); @@ -264,16 +265,16 @@ std::vector ImageSpotFinderGPU::Run(const SpotFindingSettings & ) * numberOfCudaThreads; const dim3 blocks(nBlocks, numberOfWaves); - cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice, stream)); - cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), stream)); - cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), stream)); - analyze_pixel<<>> + cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice,* stream)); + cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), *stream)); + cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), *stream)); + analyze_pixel<<>> (gpu_in, gpu_out_1, gpu_out_0, spot_params); - analyze_pixel<<>> + analyze_pixel<<>> (gpu_in, gpu_out_0, gpu_out_1, spot_params); - cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, *stream)); - cuda_err(cudaStreamSynchronize(stream)); + cuda_err(cudaStreamSynchronize(*stream)); return ExtractSpots(settings, res_mask); } diff --git a/image_analysis/spot_finding/ImageSpotFinderGPU.h b/image_analysis/spot_finding/ImageSpotFinderGPU.h index fc9ae801..9e703464 100644 --- a/image_analysis/spot_finding/ImageSpotFinderGPU.h +++ b/image_analysis/spot_finding/ImageSpotFinderGPU.h @@ -11,7 +11,7 @@ #include "../indexing/CUDAMemHelpers.h" class ImageSpotFinderGPU : public ImageSpotFinder { - CudaStream stream; + std::shared_ptr stream; CudaDevicePtr gpu_in; CudaDevicePtr gpu_out_0; @@ -24,7 +24,7 @@ class ImageSpotFinderGPU : public ImageSpotFinder { const int numberOfWaves = 32; // #waves that should work well for Nvidia L4 const int windowSizeLimit = 32; // limit on the window size (2nby+1, 2nbx+1) to prevent shared memory problems public: - ImageSpotFinderGPU(int32_t width, int32_t height); + ImageSpotFinderGPU(int32_t width, int32_t height, std::shared_ptr stream); ~ImageSpotFinderGPU() override = default; std::vector Run(const SpotFindingSettings &settings, const std::vector &res_mask) override; diff --git a/tests/ImageSpotFinderGPUTest.cpp b/tests/ImageSpotFinderGPUTest.cpp index d9afc88f..1860b7e1 100644 --- a/tests/ImageSpotFinderGPUTest.cpp +++ b/tests/ImageSpotFinderGPUTest.cpp @@ -23,7 +23,7 @@ static std::vector run_gpu_and_collect_spots(const std::vector< const SpotFindingSettings& settings, const std::vector& res_mask) { - ImageSpotFinderGPU gpu(static_cast(width), static_cast(height)); + ImageSpotFinderGPU gpu(static_cast(width), static_cast(height), std::make_shared()); REQUIRE(get_gpu_count() > 0); memcpy(gpu.GetInputBuffer().data(), input.data(), width * height * sizeof(int32_t));