diff --git a/image_analysis/MXAnalysisWithoutFPGA.cpp b/image_analysis/MXAnalysisWithoutFPGA.cpp
index 5a319f82..f35d8d84 100644
--- a/image_analysis/MXAnalysisWithoutFPGA.cpp
+++ b/image_analysis/MXAnalysisWithoutFPGA.cpp
@@ -9,13 +9,14 @@
 #include "spot_finding/SpotUtils.h"
 #include "spot_finding/ImageSpotFinderFactory.h"
 #include "bragg_prediction/BraggPredictionFactory.h"
-#include "image_preprocessing/ImagePreprocessor.h"
+#include "image_preprocessing/ImagePreprocessorCPU.h"
 
 #include "azint/AzIntEngineCPU.h"
 #include "spot_finding/ImageSpotFinderCPU.h"
 #ifdef JFJOCH_USE_CUDA
 #include "azint/AzIntEngineGPU.h"
 #include "spot_finding/ImageSpotFinderGPU.h"
+#include "image_preprocessing/ImagePreprocessorGPU.h"
 #include "../common/CUDAWrapper.h"
 #endif
 
@@ -34,15 +35,17 @@ MXAnalysisWithoutFPGA::MXAnalysisWithoutFPGA(const DiffractionExperiment &in_exp
       mask_resolution(experiment.GetPixelsNum(), false),
       mask_high_res(-1),
       mask_low_res(-1) {
-    preprocessor = std::make_unique<ImagePreprocessor>(in_experiment, in_mask);
+
 
 #ifdef JFJOCH_USE_CUDA
     if (get_gpu_count() == 0) {
 #endif
         spotFinder = std::make_unique<ImageSpotFinderCPU>(experiment.GetXPixelsNum(), experiment.GetYPixelsNum());
         azint = std::make_unique<AzIntEngineCPU>(integration);
+        preprocessor = std::make_unique<ImagePreprocessorCPU>(in_experiment, in_mask);
 #ifdef JFJOCH_USE_CUDA
     } else {
+        preprocessor = std::make_unique<ImagePreprocessorGPU>(in_experiment, in_mask);
         spotFinder = std::make_unique<ImageSpotFinderGPU>(experiment.GetXPixelsNum(), experiment.GetYPixelsNum());
         azint = std::make_unique<AzIntEngineGPU>(integration);
     }
diff --git a/image_analysis/image_preprocessing/CMakeLists.txt b/image_analysis/image_preprocessing/CMakeLists.txt
index 55a3e7cb..75a5515a 100644
--- a/image_analysis/image_preprocessing/CMakeLists.txt
+++ b/image_analysis/image_preprocessing/CMakeLists.txt
@@ -1,3 +1,11 @@
-ADD_LIBRARY(JFJochImagePreprocessing STATIC ImagePreprocessor.cpp ImagePreprocessor.h)
+ADD_LIBRARY(JFJochImagePreprocessing STATIC ImagePreprocessor.cpp ImagePreprocessor.h
+        ImagePreprocessorCPU.cpp
+        ImagePreprocessorCPU.h)
 
-TARGET_LINK_LIBRARIES(JFJochImagePreprocessing JFJochCommon)
\ No newline at end of file
+TARGET_LINK_LIBRARIES(JFJochImagePreprocessing JFJochCommon)
+
+IF (JFJOCH_CUDA_AVAILABLE)
+    TARGET_SOURCES(JFJochImagePreprocessing PRIVATE
+            ../indexing/CUDAMemHelpers.h
+            ImagePreprocessorGPU.cu ImagePreprocessorGPU.h)
+ENDIF()
\ No newline at end of file
diff --git a/image_analysis/image_preprocessing/ImagePreprocessor.cpp b/image_analysis/image_preprocessing/ImagePreprocessor.cpp
index 43427866..577dda15 100644
--- a/image_analysis/image_preprocessing/ImagePreprocessor.cpp
+++ b/image_analysis/image_preprocessing/ImagePreprocessor.cpp
@@ -3,67 +3,7 @@
 
 #include "ImagePreprocessor.h"
 
-ImagePreprocessor::ImagePreprocessor(const DiffractionExperiment &experiment,
-                                     const PixelMask &mask)
+ImagePreprocessor::ImagePreprocessor(const DiffractionExperiment &experiment)
     : npixels(experiment.GetPixelsNum()),
       experiment(experiment),
-      mask_1bit(npixels, false),
-      saturation_limit(experiment.GetSaturationLimit()) {
-
-    for (int i = 0; i < npixels; i++)
-        mask_1bit[i] = (mask.GetMask().at(i) != 0);
-}
-
-ImageStatistics ImagePreprocessor::Analyze(std::vector<int32_t> &processed_image, const uint8_t *image_ptr, CompressedImageMode image_mode) {
-    switch (image_mode) {
-        case CompressedImageMode::Int8:
-            return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
-        case CompressedImageMode::Int16:
-            return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
-        case CompressedImageMode::Int32:
-            return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
-        case CompressedImageMode::Uint8:
-            return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
-        case CompressedImageMode::Uint16:
-            return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
-        case CompressedImageMode::Uint32:
-            return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
-        default:
-            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
-    }
-}
-
-template<class T>
-ImageStatistics ImagePreprocessor::Analyze(std::vector<int32_t> &processed_image, const uint8_t *input, T err_pixel_val, T sat_pixel_val) {
-
-    if (processed_image.size() != npixels)
-        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Processed image size mismatch");
-
-    auto image = reinterpret_cast<const T *>(input);
-
-    ImageStatistics ret{};
-
-    if (sat_pixel_val > saturation_limit)
-        sat_pixel_val = static_cast<T>(saturation_limit);
-
-    for (int i = 0; i < npixels; i++) {
-        if (mask_1bit[i] != 0) {
-            processed_image[i] = INT32_MIN;
-            ++ret.masked_pixel_count;
-        } else if (image[i] >= sat_pixel_val) {
-            processed_image[i] = INT32_MAX;
-            ++ret.saturated_pixel_count;
-        } else if (std::is_signed<T>::value && (image[i] == err_pixel_val)) {
-            // Error pixels are possible only for signed types
-            processed_image[i] = INT32_MIN;
-            ++ret.error_pixel_count;
-        } else {
-            processed_image[i] = static_cast<int32_t>(image[i]);
-            if (image[i] > ret.max_value)
-                ret.max_value = image[i];
-            if (image[i] < ret.min_value)
-                ret.min_value = image[i];
-        }
-    }
-    return ret;
-}
+      saturation_limit(experiment.GetSaturationLimit()) {}
diff --git a/image_analysis/image_preprocessing/ImagePreprocessor.h b/image_analysis/image_preprocessing/ImagePreprocessor.h
index 3057ceff..7c2bbeb5 100644
--- a/image_analysis/image_preprocessing/ImagePreprocessor.h
+++ b/image_analysis/image_preprocessing/ImagePreprocessor.h
@@ -9,22 +9,21 @@
 #include "../common/DiffractionExperiment.h"
 #include "../common/PixelMask.h"
 
-struct ImageStatistics {
-    size_t error_pixel_count = 0;
-    size_t saturated_pixel_count = 0;
-    size_t masked_pixel_count = 0;
-    int64_t max_value = INT64_MIN;
-    int64_t min_value = INT64_MAX;
+struct alignas(8) ImageStatistics {
+    unsigned long long error_pixel_count = 0;
+    unsigned long long saturated_pixel_count = 0;
+    unsigned long long masked_pixel_count = 0;
+    long long max_value = INT64_MIN;
+    long long min_value = INT64_MAX;
 };
 
 class ImagePreprocessor {
 protected:
     const size_t npixels;
     const DiffractionExperiment &experiment;
-    std::vector<bool> mask_1bit;
     const int64_t saturation_limit;
-    template <class T> ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *input, T err_value, T sat_value);
 public:
-    ImagePreprocessor(const DiffractionExperiment &experiment, const PixelMask &mask);
-    ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode);
+    ImagePreprocessor(const DiffractionExperiment &experiment);
+    virtual ~ImagePreprocessor() = default;
+    virtual ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) = 0;
 };
diff --git a/image_analysis/image_preprocessing/ImagePreprocessorCPU.cpp b/image_analysis/image_preprocessing/ImagePreprocessorCPU.cpp
new file mode 100644
index 00000000..28ad9fe8
--- /dev/null
+++ b/image_analysis/image_preprocessing/ImagePreprocessorCPU.cpp
@@ -0,0 +1,65 @@
+// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
+// SPDX-License-Identifier: GPL-3.0-only
+
+#include "ImagePreprocessorCPU.h"
+
+ImagePreprocessorCPU::ImagePreprocessorCPU(const DiffractionExperiment &experiment, const PixelMask &mask)
+    : ImagePreprocessor(experiment),
+      mask_1bit(npixels, false) {
+    for (int i = 0; i < npixels; i++)
+        mask_1bit[i] = (mask.GetMask().at(i) != 0);
+}
+
+ImageStatistics ImagePreprocessorCPU::Analyze(std::vector<int32_t> &processed_image, const uint8_t *image_ptr, CompressedImageMode image_mode) {
+    switch (image_mode) {
+        case CompressedImageMode::Int8:
+            return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
+        case CompressedImageMode::Int16:
+            return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
+        case CompressedImageMode::Int32:
+            return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
+        case CompressedImageMode::Uint8:
+            return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
+        case CompressedImageMode::Uint16:
+            return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
+        case CompressedImageMode::Uint32:
+            return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
+        default:
+            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
+    }
+}
+
+template<class T>
+ImageStatistics ImagePreprocessorCPU::Analyze(std::vector<int32_t> &processed_image, const uint8_t *input, T err_pixel_val, T sat_pixel_val) {
+
+    if (processed_image.size() != npixels)
+        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Processed image size mismatch");
+
+    auto image = reinterpret_cast<const T *>(input);
+
+    ImageStatistics ret{};
+
+    if (sat_pixel_val > saturation_limit)
+        sat_pixel_val = static_cast<T>(saturation_limit);
+
+    for (int i = 0; i < npixels; i++) {
+        if (mask_1bit[i] != 0) {
+            processed_image[i] = INT32_MIN;
+            ++ret.masked_pixel_count;
+        } else if (image[i] >= sat_pixel_val) {
+            processed_image[i] = INT32_MAX;
+            ++ret.saturated_pixel_count;
+        } else if (std::is_signed<T>::value && (image[i] == err_pixel_val)) {
+            // Error pixels are possible only for signed types
+            processed_image[i] = INT32_MIN;
+            ++ret.error_pixel_count;
+        } else {
+            processed_image[i] = static_cast<int32_t>(image[i]);
+            if (image[i] > ret.max_value)
+                ret.max_value = image[i];
+            if (image[i] < ret.min_value)
+                ret.min_value = image[i];
+        }
+    }
+    return ret;
+}
diff --git a/image_analysis/image_preprocessing/ImagePreprocessorCPU.h b/image_analysis/image_preprocessing/ImagePreprocessorCPU.h
new file mode 100644
index 00000000..9f68b9b0
--- /dev/null
+++ b/image_analysis/image_preprocessing/ImagePreprocessorCPU.h
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
+// SPDX-License-Identifier: GPL-3.0-only
+
+#pragma once
+
+#include "ImagePreprocessor.h"
+
+class ImagePreprocessorCPU : public ImagePreprocessor {
+
+    std::vector<bool> mask_1bit;
+
+    template <class T> ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *input, T err_value, T sat_value);
+public:
+    ImagePreprocessorCPU(const DiffractionExperiment &experiment, const PixelMask &mask);
+    ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) override;
+};
diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
new file mode 100644
index 00000000..280d13dd
--- /dev/null
+++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
@@ -0,0 +1,175 @@
+// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
+// SPDX-License-Identifier: GPL-3.0-only
+
+#include "ImagePreprocessorGPU.h"
+
+template<class T>
+__global__ void preprocess_kernel(
+    const T *__restrict__ input,
+    const uint8_t *__restrict__ mask,
+    int32_t *__restrict__ output,
+    ImageStatistics *__restrict__ stats,
+    T saturation_limit,
+    T err_value,
+    int npixels) {
+    // Shared block accumulators
+    __shared__ unsigned long long s_masked;
+    __shared__ unsigned long long s_saturated;
+    __shared__ unsigned long long s_error;
+    __shared__ long long s_max;
+    __shared__ long long s_min;
+
+    if (threadIdx.x == 0) {
+        s_masked = 0;
+        s_saturated = 0;
+        s_error = 0;
+        s_max = INT64_MIN;
+        s_min = INT64_MAX;
+    }
+    __syncthreads();
+
+    // Thread-local accumulators
+    unsigned long long local_masked = 0;
+    unsigned long long local_saturated = 0;
+    unsigned long long local_error = 0;
+    long long local_max = INT64_MIN;
+    long long local_min = INT64_MAX;
+
+    constexpr bool is_signed = std::is_signed<T>::value;
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < npixels;
+         i += blockDim.x * gridDim.x) {
+        T v = input[i];
+
+        bool is_masked = mask[i];
+        bool is_sat = v >= saturation_limit;
+        bool is_err = is_signed && (v == err_value);
+
+        bool valid = !(is_masked || is_sat || is_err);
+
+        // Output
+        output[i] =
+            is_masked ? INT32_MIN :
+            is_sat    ? INT32_MAX :
+            is_err    ? INT32_MIN :
+                        (int32_t)v;
+
+        // Counters
+        local_masked   += is_masked;
+        local_saturated+= (!is_masked && is_sat);
+        local_error    += (!is_masked && !is_sat && is_err);
+
+        // Min/max only for valid
+        if (valid) {
+            int64_t val = (int64_t)v;
+            if (val > local_max) local_max = val;
+            if (val < local_min) local_min = val;
+        }
+    }
+
+    // Reduce to shared memory
+    atomicAdd(&s_masked, local_masked);
+    atomicAdd(&s_saturated, local_saturated);
+    atomicAdd(&s_error, local_error);
+
+    if (local_min <= local_max) {
+        atomicMax((long long*)&s_max, (long long)local_max);
+        atomicMin((long long*)&s_min, (long long)local_min);
+    }
+
+    __syncthreads();
+
+    // One thread writes block result
+    if (threadIdx.x == 0) {
+        atomicAdd(&stats->masked_pixel_count, s_masked);
+        atomicAdd(&stats->saturated_pixel_count, s_saturated);
+        atomicAdd(&stats->error_pixel_count, s_error);
+
+        atomicMax((long long*)&stats->max_value, (long long)s_max);
+        atomicMin((long long*)&stats->min_value, (long long)s_min);
+    }
+}
+
+ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
+    : ImagePreprocessor(experiment),
+      gpu_mask(npixels),
+      gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
+      gpu_image(npixels),
+      gpu_stats(1),
+      cpu_stats(1),
+      cpu_stats_reg(cpu_stats) {
+
+    stream = std::make_shared<CudaStream>();
+
+    // Setup mask
+    std::vector<uint8_t> mask_vec(npixels);
+    for (int i = 0; i < npixels; i++)
+        mask_vec[i] = (mask.GetMask().at(i) != 0);
+
+    cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice);
+
+    // Setup GPU settings
+    cudaDeviceProp prop{};
+    cudaGetDeviceProperties(&prop, 0);
+
+    threads = 128;
+    blocks = 4 * prop.multiProcessorCount;
+}
+
+ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image, const uint8_t *image_ptr,
+                                              CompressedImageMode image_mode) {
+    switch (image_mode) {
+        case CompressedImageMode::Int8:
+            return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
+        case CompressedImageMode::Int16:
+            return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
+        case CompressedImageMode::Int32:
+            return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
+        case CompressedImageMode::Uint8:
+            return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
+        case CompressedImageMode::Uint16:
+            return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
+        case CompressedImageMode::Uint32:
+            return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
+        default:
+            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
+    }
+}
+
+template<class T>
+ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image,
+                                              const uint8_t *input,
+                                              T err_value,
+                                              T sat_value) {
+
+    if (sat_value > saturation_limit)
+        sat_value = static_cast<T>(saturation_limit);
+
+    cudaMemcpy(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice);
+
+    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
+    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
+    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
+        reinterpret_cast<const T *>(gpu_decompressed_image.get()),
+        gpu_mask,
+        gpu_image,
+        gpu_stats,
+        sat_value,
+        err_value,
+        npixels);
+    cudaMemcpyAsync(processed_image.data(), gpu_image, npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
+    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);
+
+    cudaStreamSynchronize(*stream);
+
+    return cpu_stats[0];
+}
+
+const int32_t *ImagePreprocessorGPU::GetImageDevicePtr() const {
+    return gpu_image;
+}
+
+std::shared_ptr<CudaStream> ImagePreprocessorGPU::GetStream() const {
+    return stream;
+}
diff --git a/image_analysis/image_preprocessing/ImagePreprocessorGPU.h b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h
new file mode 100644
index 00000000..b877a1e5
--- /dev/null
+++ b/image_analysis/image_preprocessing/ImagePreprocessorGPU.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
+// SPDX-License-Identifier: GPL-3.0-only
+
+#pragma once
+
+#include "ImagePreprocessor.h"
+#include "../indexing/CUDAMemHelpers.h"
+
+class ImagePreprocessorGPU : public ImagePreprocessor {
+    std::shared_ptr<CudaStream> stream;
+    int threads;
+    int blocks;
+    CudaDevicePtr<uint8_t> gpu_mask;
+    CudaDevicePtr<uint8_t> gpu_decompressed_image;
+    CudaDevicePtr<int32_t> gpu_image;
+    CudaDevicePtr<ImageStatistics> gpu_stats;
+
+    std::vector<ImageStatistics> cpu_stats;
+    CudaRegisteredVector<ImageStatistics> cpu_stats_reg;
+
+    std::vector<int32_t> cpu_image;
+
+    template <class T> ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *input, T err_value, T sat_value);
+public:
+    ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask);
+    ImageStatistics Analyze(std::vector<int32_t> &processed_image, const uint8_t *decompressed_image, CompressedImageMode image_mode) override;
+
+    const int32_t* GetImageDevicePtr() const;
+    std::shared_ptr<CudaStream> GetStream() const;
+};
+