diff --git a/image_analysis/MXAnalysisWithoutFPGA.cpp b/image_analysis/MXAnalysisWithoutFPGA.cpp
index f35d8d84..ceb02df2 100644
--- a/image_analysis/MXAnalysisWithoutFPGA.cpp
+++ b/image_analysis/MXAnalysisWithoutFPGA.cpp
@@ -7,7 +7,6 @@
 #include "../compression/JFJochDecompress.h"
 
 #include "spot_finding/SpotUtils.h"
-#include "spot_finding/ImageSpotFinderFactory.h"
 #include "bragg_prediction/BraggPredictionFactory.h"
 #include "image_preprocessing/ImagePreprocessorCPU.h"
 
@@ -45,9 +44,10 @@ MXAnalysisWithoutFPGA::MXAnalysisWithoutFPGA(const DiffractionExperiment &in_exp
         preprocessor = std::make_unique<ImagePreprocessorCPU>(in_experiment, in_mask);
 #ifdef JFJOCH_USE_CUDA
     } else {
-        preprocessor = std::make_unique<ImagePreprocessorGPU>(in_experiment, in_mask);
-        spotFinder = std::make_unique<ImageSpotFinderGPU>(experiment.GetXPixelsNum(), experiment.GetYPixelsNum());
-        azint = std::make_unique<AzIntEngineGPU>(integration);
+        auto stream = std::make_shared<CudaStream>();
+        preprocessor = std::make_unique<ImagePreprocessorGPU>(in_experiment, in_mask, stream);
+        spotFinder = std::make_unique<ImageSpotFinderGPU>(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), stream);
+        azint = std::make_unique<AzIntEngineGPU>(integration, stream);
     }
 #endif
 }
diff --git a/image_analysis/azint/AzIntEngineGPU.cu b/image_analysis/azint/AzIntEngineGPU.cu
index 39aca31e..0fda4812 100644
--- a/image_analysis/azint/AzIntEngineGPU.cu
+++ b/image_analysis/azint/AzIntEngineGPU.cu
@@ -88,8 +88,9 @@ void gpu_azim(
     }
 }
 
-AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegration &integration)
+AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegration &integration, std::shared_ptr<CudaStream> stream)
     : AzIntEngine(integration),
+      stream(stream),
       gpu_azint_correction(npixel),
       gpu_pixel_to_bin(npixel),
       gpu_sum(azint_bins),
@@ -117,27 +118,27 @@ AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegration &integration)
 void AzIntEngineGPU::Run(const std::vector<int32_t> &image, AzimuthalIntegrationProfile &profile) {
     if (image.size() != integration.GetPixelToBin().size())
         throw std::runtime_error("ImageSpotFinder::AzimIntegration: Mismatch in size");
-    cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, stream));
-    cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, stream));
-    cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, stream));
-    cudaMemcpyAsync(gpu_image, image.data(), sizeof(int32_t) * npixel, cudaMemcpyHostToDevice, stream);
+    cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, *stream));
+    cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, *stream));
+    cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, *stream));
+    cudaMemcpyAsync(gpu_image, image.data(), sizeof(int32_t) * npixel, cudaMemcpyHostToDevice, *stream);
 
     if (shared_needed < shared_size) {
-        gpu_azim_shared<<<blocks, threads, shared_needed, stream>>>(
+        gpu_azim_shared<<<blocks, threads, shared_needed, *stream>>>(
         gpu_pixel_to_bin,gpu_azint_correction,gpu_image, gpu_sum, gpu_sum2,
         gpu_count, npixel, azint_bins
         );
     } else {
-        gpu_azim<<<blocks, threads, 0, stream>>>(
+        gpu_azim<<<blocks, threads, 0, *stream>>>(
             gpu_pixel_to_bin,gpu_azint_correction,gpu_image, gpu_sum, gpu_sum2,
             gpu_count, npixel, azint_bins
         );
     }
 
-    cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, stream);
-    cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, stream);
-    cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, stream);
-    cuda_err(cudaStreamSynchronize(stream));
+    cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream);
+    cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream);
+    cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, *stream);
+    cuda_err(cudaStreamSynchronize(*stream));
 
     profile.Clear(integration);
     profile.Add(azint_sum, azint_count);
diff --git a/image_analysis/azint/AzIntEngineGPU.h b/image_analysis/azint/AzIntEngineGPU.h
index 3ba5dad8..2ecf7d75 100644
--- a/image_analysis/azint/AzIntEngineGPU.h
+++ b/image_analysis/azint/AzIntEngineGPU.h
@@ -7,7 +7,7 @@
 #include "../indexing/CUDAMemHelpers.h"
 
 class AzIntEngineGPU : public AzIntEngine {
-    CudaStream stream;
+    std::shared_ptr<CudaStream> stream;
     int threads;
     int blocks;
     size_t shared_needed;
@@ -25,6 +25,6 @@ class AzIntEngineGPU : public AzIntEngine {
 
     CudaDevicePtr<int32_t> gpu_image;
 public:
-    AzIntEngineGPU(const AzimuthalIntegration& integration);
+    AzIntEngineGPU(const AzimuthalIntegration& integration, std::shared_ptr<CudaStream> stream);
     void Run(const std::vector<int32_t> &image, AzimuthalIntegrationProfile &profile) override;
 };
diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
index 280d13dd..d253270d 100644
--- a/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
+++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
@@ -50,19 +50,16 @@ __global__ void preprocess_kernel(
 
         // Output
         output[i] =
-            is_masked ? INT32_MIN :
-            is_sat    ? INT32_MAX :
-            is_err    ? INT32_MIN :
-                        (int32_t)v;
+                is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;
 
         // Counters
-        local_masked   += is_masked;
-        local_saturated+= (!is_masked && is_sat);
-        local_error    += (!is_masked && !is_sat && is_err);
+        local_masked += is_masked;
+        local_saturated += (!is_masked && is_sat);
+        local_error += (!is_masked && !is_sat && is_err);
 
         // Min/max only for valid
         if (valid) {
-            int64_t val = (int64_t)v;
+            int64_t val = (int64_t) v;
             if (val > local_max) local_max = val;
             if (val < local_min) local_min = val;
         }
@@ -74,8 +71,8 @@ __global__ void preprocess_kernel(
     atomicAdd(&s_error, local_error);
 
     if (local_min <= local_max) {
-        atomicMax((long long*)&s_max, (long long)local_max);
-        atomicMin((long long*)&s_min, (long long)local_min);
+        atomicMax((long long *) &s_max, (long long) local_max);
+        atomicMin((long long *) &s_min, (long long) local_min);
     }
 
     __syncthreads();
@@ -86,22 +83,21 @@ __global__ void preprocess_kernel(
         atomicAdd(&stats->saturated_pixel_count, s_saturated);
         atomicAdd(&stats->error_pixel_count, s_error);
 
-        atomicMax((long long*)&stats->max_value, (long long)s_max);
-        atomicMin((long long*)&stats->min_value, (long long)s_min);
+        atomicMax((long long *) &stats->max_value, (long long) s_max);
+        atomicMin((long long *) &stats->min_value, (long long) s_min);
     }
 }
 
-ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
+ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
+                                           std::shared_ptr<CudaStream> stream)
     : ImagePreprocessor(experiment),
+      stream(stream),
       gpu_mask(npixels),
-      gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
+      gpu_decompressed_image(npixels * sizeof(uint32_t)),  // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
       gpu_image(npixels),
       gpu_stats(1),
       cpu_stats(1),
       cpu_stats_reg(cpu_stats) {
-
-    stream = std::make_shared<CudaStream>();
-
     // Setup mask
     std::vector<uint8_t> mask_vec(npixels);
     for (int i = 0; i < npixels; i++)
@@ -142,7 +138,6 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
                                               const uint8_t *input,
                                               T err_value,
                                               T sat_value) {
-
     if (sat_value > saturation_limit)
         sat_value = static_cast<T>(saturation_limit);
 
@@ -150,7 +145,7 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
 
     cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
     cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
-    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
+    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
         reinterpret_cast<const T *>(gpu_decompressed_image.get()),
         gpu_mask,
         gpu_image,
diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.h b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h
index b877a1e5..ed8f0284 100644
--- a/image_analysis/image_preprocessing/ImagePreprocessorGPU.h
+++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h
@@ -22,7 +22,7 @@ class ImagePreprocessorGPU : public ImagePreprocessor {
 
     template <class T> ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *input, T err_value, T sat_value);
 public:
-    ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask);
+    ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask, std::shared_ptr<CudaStream> stream);
     ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) override;
 
     const int32_t* GetImageDevicePtr() const;
diff --git a/image_analysis/spot_finding/CMakeLists.txt b/image_analysis/spot_finding/CMakeLists.txt
index abec1704..44bd6ca1 100644
--- a/image_analysis/spot_finding/CMakeLists.txt
+++ b/image_analysis/spot_finding/CMakeLists.txt
@@ -9,8 +9,6 @@ ADD_LIBRARY(JFJochSpotFinding STATIC
         DetModuleSpotFinder_cpu.h
         ImageSpotFinder.cpp
         ImageSpotFinder.h
-        ImageSpotFinderFactory.cpp
-        ImageSpotFinderFactory.h
 )
 
 TARGET_LINK_LIBRARIES(JFJochSpotFinding JFJochCommon)
diff --git a/image_analysis/spot_finding/ImageSpotFinderFactory.cpp b/image_analysis/spot_finding/ImageSpotFinderFactory.cpp
deleted file mode 100644
index da7dce0e..00000000
--- a/image_analysis/spot_finding/ImageSpotFinderFactory.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
-// SPDX-License-Identifier: GPL-3.0-only
-
-#include "ImageSpotFinderFactory.h"
-#include "ImageSpotFinderCPU.h"
-
-#ifdef JFJOCH_USE_CUDA
-#include "../../common/CUDAWrapper.h"
-#include "ImageSpotFinderGPU.h"
-#endif
-
-std::unique_ptr<ImageSpotFinder> CreateImageSpotFinder(size_t width, size_t height) {
-#ifdef JFJOCH_USE_CUDA
-    if (get_gpu_count() > 0)
-        return std::make_unique<ImageSpotFinderGPU>(width, height);
-#endif
-    return std::make_unique<ImageSpotFinderCPU>(width, height);
-}
diff --git a/image_analysis/spot_finding/ImageSpotFinderFactory.h b/image_analysis/spot_finding/ImageSpotFinderFactory.h
deleted file mode 100644
index ef74eb5e..00000000
--- a/image_analysis/spot_finding/ImageSpotFinderFactory.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
-// SPDX-License-Identifier: GPL-3.0-only
-
-#ifndef JFJOCH_IMAGESPOTFINDERFACTORY_H
-#define JFJOCH_IMAGESPOTFINDERFACTORY_H
-
-#include "ImageSpotFinder.h"
-
-std::unique_ptr<ImageSpotFinder> CreateImageSpotFinder(size_t width, size_t height);
-
-#endif //JFJOCH_IMAGESPOTFINDERFACTORY_H
\ No newline at end of file
diff --git a/image_analysis/spot_finding/ImageSpotFinderGPU.cu b/image_analysis/spot_finding/ImageSpotFinderGPU.cu
index ebc3fe19..199cc3ce 100644
--- a/image_analysis/spot_finding/ImageSpotFinderGPU.cu
+++ b/image_analysis/spot_finding/ImageSpotFinderGPU.cu
@@ -228,10 +228,11 @@ __global__ void analyze_pixel(const int32_t *in, uint32_t *prev_out, uint32_t *o
     } while (back < rmax);
 }
 
-ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height) :
-        ImageSpotFinder(in_width, in_height),
-        input_buffer_reg(input_buffer),
-        output_buffer_reg(output_buffer) {
+ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height,
+                                       std::shared_ptr<CudaStream> stream) : stream(stream),
+                                                                             ImageSpotFinder(in_width, in_height),
+                                                                             input_buffer_reg(input_buffer),
+                                                                             output_buffer_reg(output_buffer) {
     gpu_in = CudaDevicePtr<int32_t>(width * height);
     gpu_out_0 = CudaDevicePtr<uint32_t>(OutputSize());
     gpu_out_1 = CudaDevicePtr<uint32_t>(OutputSize());
@@ -264,16 +265,16 @@ std::vector<DiffractionSpot> ImageSpotFinderGPU::Run(const SpotFindingSettings &
                             ) * numberOfCudaThreads;
     const dim3 blocks(nBlocks, numberOfWaves);
 
-    cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice, stream));
-    cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), stream));
-    cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), stream));
-    analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, stream>>>
+    cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice,* stream));
+    cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), *stream));
+    cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), *stream));
+    analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, *stream>>>
             (gpu_in, gpu_out_1, gpu_out_0, spot_params);
-    analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, stream>>>
+    analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, *stream>>>
             (gpu_in, gpu_out_0, gpu_out_1, spot_params);
-    cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+    cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, *stream));
 
-    cuda_err(cudaStreamSynchronize(stream));
+    cuda_err(cudaStreamSynchronize(*stream));
 
     return ExtractSpots(settings, res_mask);
 }
diff --git a/image_analysis/spot_finding/ImageSpotFinderGPU.h b/image_analysis/spot_finding/ImageSpotFinderGPU.h
index fc9ae801..9e703464 100644
--- a/image_analysis/spot_finding/ImageSpotFinderGPU.h
+++ b/image_analysis/spot_finding/ImageSpotFinderGPU.h
@@ -11,7 +11,7 @@
 #include "../indexing/CUDAMemHelpers.h"
 
 class ImageSpotFinderGPU : public ImageSpotFinder {
-    CudaStream stream;
+    std::shared_ptr<CudaStream> stream;
 
     CudaDevicePtr<int32_t> gpu_in;
     CudaDevicePtr<uint32_t> gpu_out_0;
@@ -24,7 +24,7 @@ class ImageSpotFinderGPU : public ImageSpotFinder {
     const int numberOfWaves = 32;           // #waves that should work well for Nvidia L4
     const int windowSizeLimit = 32;         // limit on the window size (2nby+1, 2nbx+1) to prevent shared memory problems
 public:
-    ImageSpotFinderGPU(int32_t width, int32_t height);
+    ImageSpotFinderGPU(int32_t width, int32_t height, std::shared_ptr<CudaStream> stream);
     ~ImageSpotFinderGPU() override = default;
 
     std::vector<DiffractionSpot> Run(const SpotFindingSettings &settings, const std::vector<bool> &res_mask) override;
diff --git a/tests/ImageSpotFinderGPUTest.cpp b/tests/ImageSpotFinderGPUTest.cpp
index d9afc88f..1860b7e1 100644
--- a/tests/ImageSpotFinderGPUTest.cpp
+++ b/tests/ImageSpotFinderGPUTest.cpp
@@ -23,7 +23,7 @@ static std::vector<DiffractionSpot> run_gpu_and_collect_spots(const std::vector<
                                                               const SpotFindingSettings& settings,
                                                               const std::vector<bool>& res_mask)
 {
-    ImageSpotFinderGPU gpu(static_cast<int32_t>(width), static_cast<int32_t>(height));
+    ImageSpotFinderGPU gpu(static_cast<int32_t>(width), static_cast<int32_t>(height), std::make_shared<CudaStream>());
     REQUIRE(get_gpu_count() > 0);
 
     memcpy(gpu.GetInputBuffer().data(), input.data(), width * height * sizeof(int32_t));