diff --git a/image_analysis/GPUImageAnalysis.cpp b/image_analysis/GPUImageAnalysis.cpp
index ae4487a9..07e0d335 100644
--- a/image_analysis/GPUImageAnalysis.cpp
+++ b/image_analysis/GPUImageAnalysis.cpp
@@ -8,8 +8,8 @@ struct CudaStreamWrapper {
     int32_t just_anything_as_this_wont_be_used;
 };
 
-GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const std::vector<uint8_t> &mask) :
-        xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr), rad_integration_nbins(0) {
+GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels) :
+        xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr) {
 }
 
 GPUImageAnalysis::~GPUImageAnalysis() {}
@@ -32,6 +32,6 @@ void GPUImageAnalysis::RegisterBuffer() {}
 
 void GPUImageAnalysis::UnregisterBuffer() {}
 
-void GPUImageAnalysis::LoadDataToGPU(bool apply_pixel_mask_on_gpu) {}
+void GPUImageAnalysis::LoadDataToGPU() {}
 
 #endif
diff --git a/image_analysis/GPUImageAnalysis.cu b/image_analysis/GPUImageAnalysis.cu
index 0157b90e..37c3b71a 100644
--- a/image_analysis/GPUImageAnalysis.cu
+++ b/image_analysis/GPUImageAnalysis.cu
@@ -1,14 +1,12 @@
 // Copyright (2019-2023) Paul Scherrer Institute
-
 #include "GPUImageAnalysis.h"
-#include "../common/JFJochException.h"
-#include "../common/DiffractionGeometry.h"
 
-#include <sstream>
-#include "../common/CUDAWrapper.h"
+#include "../common/JFJochException.h"
 
 // input      X x Y pixels array
-// output     X x Y byte array
+// output     X x Y bit array
+
+static constexpr int WARP_SIZE = 32;    // assume warp size of 32 cuda threads per warp
 
 struct CudaStreamWrapper {
     cudaStream_t v;
@@ -19,6 +17,60 @@ inline void cuda_err(cudaError_t val) {
         throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
 }
 
+// Calculate byte size of output
+// output_type: type of output array elements
+// ptr: dummy pointer to make template type inference possible
+// xpixels: number of bits (=pixel results) in x direction
+// ypixels: number of bits (=pixel results) in y direction
+// returns the size of the output (xpixels X ypixels bits) in bytes for an array with elements of type output_type
+template <typename output_type>
+static std::size_t output_byte_size(const output_type* ptr, int32_t xpixels, int32_t ypixels) noexcept {
+    constexpr auto bit_group_size = sizeof(output_type) * 8;    // number of bits per output array element
+    return ((xpixels * ypixels + bit_group_size - 1) / bit_group_size) * sizeof(output_type);
+}
+
+// Get the bit ptr[line, col] of a bit array with lines of size xpixels bits
+// output_type: type of bit array elements
+// ptr: pointer to array with bits stored in element type output_type
+// xpixels: size in bits (=pixel results) of one line (=row of array)
+// line: bit line (=row) number
+// col: bit column number
+// returns the bit ptr[line, col]
+template <typename output_type>
+static bool get_bit(const output_type* ptr, int32_t xpixels, int32_t line, int32_t col) noexcept {
+    constexpr auto bit_group_size = sizeof(output_type) * 8;    // number of bits per output array element
+    const auto global_bit_idx = line * xpixels + col;
+    const auto element_idx = global_bit_idx / bit_group_size;
+    const auto local_bit_idx = global_bit_idx % bit_group_size;
+    return (ptr[element_idx] & (1 << local_bit_idx)) != 0;
+}
+
+// Write pixel results to bit array
+// params: spot finding parameters
+// out: pixel result bit array
+// pixel: flat pixel index = bit index into bit array
+// val: pixel result
+// **NOTE**: assumes sizeof(*out) * 8 == WARP_SIZE
+__device__ __forceinline__ void write_result(const spot_parameters& params, uint32_t* out, int32_t pixel, uint8_t val)
+{
+    static_assert(sizeof(*out) * 8 == WARP_SIZE, "Violation of essential implementation assumtion: WARP_SIZE must match output array element type bit size!");
+    static constexpr unsigned ALL_THREADS = unsigned{-1};
+    const int32_t laneid = threadIdx.x & (WARP_SIZE - 1);
+    unsigned result = __ballot_sync(ALL_THREADS, val);
+    const int32_t idx = pixel / WARP_SIZE;      // global uint32_t index
+    const int32_t bit = pixel % WARP_SIZE;      // local bit index
+
+    if ((bit >= laneid) & (laneid == 0)) {      // write to upper part of uint32_t
+        result <<= bit;
+        if (result)
+            atomicOr(&out[idx], result);
+    } else if ((bit < laneid) & (bit == 0)) {   // write to lower part of uint32_t
+        result >>= laneid;
+        if (result)
+            atomicOr(&out[idx], result);
+    }
+}
+
 // Determine if pixel could be a spot
 // params: spot finding parameters
 // val: pixel value
@@ -44,7 +96,7 @@ __device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, c
 
 // Find pixels that could be spots
 // in: image input values
-// out: pixel result byte array, 1 byte per pixel (0:no/1:candidate spot)
+// out: pixel result bit array, 1 bit per pixel (0:no/1:candidate spot)
 // params: spot finding parameters
 //
 // The algorithm uses multiple waves (blockDim.y) that run over sections of rows.
@@ -55,7 +107,7 @@ __device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, c
 // Every cuda thread is associated with a particular column. The thread maintains
 // the sum/sum2/count values in shared memory for it's column. To do this, the input
 // pixel values for the hight of the aggregation window are saved in shared memory.
-__global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parameters params)
+__global__ void analyze_pixel(const int16_t *in, uint32_t *out, const spot_parameters params)
 {
     // assumption: 2 * params.nby + 1 <= params.rows and 2 * params.nbx + 1 <= params.cols
     const int32_t window = 2 * (int)params.nby + 1;                                         // vertical window
@@ -111,6 +163,7 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame
     // wave front up to rmax
     do {
         __syncthreads();    // make others see the shared values
+        uint8_t val = 0;
         if (result_col) {   // write at the back end of the wave
             total_sum = total_sum2 = total_count = 0;
             for (auto j = left; j < right; j++) {
@@ -118,8 +171,9 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame
                 total_sum2 += shared_sum2[j];
                 total_count += shared_count[j];
             }
-            out[back * params.cols + col] =  pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
+            val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
         }
+        write_result(params, out, back * params.cols + col, val);
         back++;
         __syncthreads();    // keep shared values until others have seen them
         if (data_col) {     // read at the front end of the wave
@@ -144,6 +198,7 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame
     // wave back up to rmax
     do {
         __syncthreads();    // make others see the shared values
+        uint8_t val = 0;
         if (result_col) {   // write at the back end of the wave
             total_sum = total_sum2 = total_count = 0;
             for (auto j = left; j < right; j++) {
@@ -151,8 +206,9 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame
                 total_sum2 += shared_sum2[j];
                 total_count += shared_count[j];
             }
-            out[back * params.cols + col] = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
+            val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
         }
+        write_result(params, out, back * params.cols + col, val);
         back++;
         __syncthreads();    // keep shared values until others have seen them
         if (data_col) {     // read at the front end of the wave if possible
@@ -178,30 +234,13 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame
     } while (back < rmax);
 }
 
-__global__ void spot_finder_reduce(uint32_t *output, uint32_t* counter, const uint8_t* input,
-                                   uint32_t npixel, uint32_t output_size) {
-    uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;
-    if (idx < npixel) {
-        if (input[idx]) {
-            auto old_counter = atomicAdd(counter, 1);
-            if (old_counter < output_size)
-                output[old_counter] = idx;
-        }
-    }
-}
+GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels) :
+        xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr) {
 
-__global__ void apply_pixel_mask(int16_t *image, const uint8_t *mask, uint32_t npixel) {
-    uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;
-    if (idx < npixel) {
-        if (mask[idx] == 0)
-            image[idx] = INT16_MIN;
-    }
-}
+    int device_count;
+    cuda_err(cudaGetDeviceCount(&device_count));
 
-GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const std::vector<uint8_t> &mask) :
-        xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr), numberOfSMs(1) {
-
-    if (get_gpu_count() == 0)
+    if (device_count == 0)
         throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "No CUDA devices found");
 
     int deviceId;
@@ -215,32 +254,18 @@ GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const
     cudastream = new(CudaStreamWrapper);
 
     cuda_err(cudaStreamCreate(&cudastream->v));
-    cuda_err(cudaMalloc(&gpu_mask, xpixels * ypixels * sizeof(int8_t)));
     cuda_err(cudaMalloc(&gpu_in, xpixels * ypixels * sizeof(int16_t)));
-    cuda_err(cudaMalloc(&gpu_out, xpixels * ypixels * sizeof(int8_t)));
-    cuda_err(cudaMalloc(&gpu_out_reduced, maxStrongPixel * sizeof(uint32_t)));
-    cuda_err(cudaMalloc(&gpu_out_reduced_counter, sizeof(uint32_t)));
-    cuda_err(cudaHostAlloc(&host_out_reduced, maxStrongPixel * sizeof(uint32_t), cudaHostAllocPortable));
-    cuda_err(cudaHostAlloc(&host_out_reduced_counter, sizeof(uint32_t), cudaHostAllocPortable));
-
-    cuda_err(cudaMemsetAsync(gpu_mask, 1, xpixels*ypixels, cudastream->v));
-
-    if (mask.size() != xpixels * ypixels)
-        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Mismatch in mask size");
-    cudaMemcpy(gpu_mask, mask.data(), xpixels*ypixels, cudaMemcpyHostToDevice);
+    cuda_err(cudaMalloc(&gpu_out, output_byte_size(gpu_out, xpixels, ypixels)));
+    cuda_err(cudaHostAlloc(&host_out, output_byte_size(host_out, xpixels, ypixels), cudaHostAllocPortable));
 }
 
 GPUImageAnalysis::~GPUImageAnalysis() {
     cudaStreamDestroy(cudastream->v);
     delete(cudastream);
 
-    cudaFreeHost(host_out_reduced);
-    cudaFreeHost(host_out_reduced_counter);
-    cudaFree(gpu_mask);
+    cudaFreeHost(host_out);
     cudaFree(gpu_in);
     cudaFree(gpu_out);
-    cudaFree(gpu_out_reduced);
-    cudaFree(gpu_out_reduced_counter);
 }
 
 void GPUImageAnalysis::SetInputBuffer(void *ptr) {
@@ -287,23 +312,12 @@ void GPUImageAnalysis::RunSpotFinder(const JFJochProtoBuf::DataProcessingSetting
                                 ) * numberOfCudaThreads;
         const dim3 blocks(nBlocks, numberOfWaves);
 
-        cuda_err(cudaMemsetAsync(gpu_out, 0, xpixels * ypixels * sizeof(uint8_t), cudastream->v));
+        cuda_err(cudaMemsetAsync(gpu_out, 0, output_byte_size(gpu_out, xpixels, ypixels), cudastream->v));
         analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, cudastream->v>>>
                 (gpu_in, gpu_out, spot_params);
     }
 
-    {
-        cuda_err(cudaMemsetAsync(gpu_out_reduced_counter, 0, sizeof(uint32_t), cudastream->v));
-        const auto nblocks =
-                xpixels * ypixels / numberOfCudaThreads + ((xpixels * ypixels % numberOfCudaThreads == 0) ? 0 : 1);
-
-        spot_finder_reduce<<<nblocks, numberOfCudaThreads, 0, cudastream->v>>>(gpu_out_reduced, gpu_out_reduced_counter,
-                                                                               gpu_out, xpixels*ypixels, maxStrongPixel);
-    }
-    cuda_err(cudaMemcpyAsync(host_out_reduced, gpu_out_reduced, maxStrongPixel * sizeof(uint32_t),
-                             cudaMemcpyDeviceToHost,cudastream->v));
-
-    cuda_err(cudaMemcpyAsync(host_out_reduced_counter, gpu_out_reduced_counter, sizeof(uint32_t),
+    cuda_err(cudaMemcpyAsync(host_out, gpu_out, output_byte_size(gpu_out, xpixels, ypixels),
                              cudaMemcpyDeviceToHost,cudastream->v));
 }
 
@@ -313,17 +327,26 @@ void GPUImageAnalysis::GetSpotFinderResults(StrongPixelSet &pixel_set) {
 
     cuda_err(cudaStreamSynchronize(cudastream->v));
 
-    for (int i = 0; i < std::min<uint32_t>(maxStrongPixel, *host_out_reduced_counter); i++) {
-        size_t npixel = host_out_reduced[i];
-        size_t line = npixel / xpixels;
-        size_t col = npixel % xpixels;
-        pixel_set.AddStrongPixel(col, line, host_in[npixel]);
+    auto out_size = output_byte_size(host_out, xpixels, ypixels);
+    auto out_ptr = (uint32_t *) host_out;
+
+    for (int i = 0; i < out_size / sizeof(out_ptr[0]); i++) {
+        if (out_ptr[i]) {
+            for (int j = 0; j < 8 * sizeof(out_ptr[0]); j++) {
+                if (out_ptr[i] & (1 << j)) {
+                    size_t npixel = i * 8 * sizeof(out_ptr[0])| j;
+                    size_t line = npixel / xpixels;
+                    size_t col = npixel % xpixels;
+                    pixel_set.AddStrongPixel(col, line, host_in[npixel]);
+                }
+            }
+        }
     }
 }
 
 void GPUImageAnalysis::GetSpotFinderResults(const DiffractionExperiment &experiment,
-                                  const JFJochProtoBuf::DataProcessingSettings &settings,
-                                  std::vector<DiffractionSpot> &vec) {
+                                            const JFJochProtoBuf::DataProcessingSettings &settings,
+                                            std::vector<DiffractionSpot> &vec) {
     StrongPixelSet pixel_set(experiment);
     GetSpotFinderResults(pixel_set);
     pixel_set.FindSpots(experiment, settings, vec);
@@ -337,16 +360,9 @@ void GPUImageAnalysis::UnregisterBuffer() {
     cudaHostUnregister(host_in);
 }
 
-void GPUImageAnalysis::LoadDataToGPU(bool apply_pixel_mask_on_gpu) {
+void GPUImageAnalysis::LoadDataToGPU() {
     if (host_in == nullptr)
         throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Host/GPU buffer not defined");
 
     cuda_err(cudaMemcpy(gpu_in, host_in, xpixels * ypixels * sizeof(int16_t), cudaMemcpyHostToDevice));
-
-    if (apply_pixel_mask_on_gpu) {
-        const auto nblocks =
-                xpixels * ypixels / numberOfCudaThreads + ((xpixels * ypixels % numberOfCudaThreads == 0) ? 0 : 1);
-        apply_pixel_mask<<<nblocks, numberOfCudaThreads, 0, cudastream->v>>>(gpu_in, gpu_mask, xpixels * ypixels);
-    }
 }
-
diff --git a/image_analysis/GPUImageAnalysis.h b/image_analysis/GPUImageAnalysis.h
index 5b4d0bb1..8e20818e 100644
--- a/image_analysis/GPUImageAnalysis.h
+++ b/image_analysis/GPUImageAnalysis.h
@@ -27,26 +27,21 @@ class GPUImageAnalysis {
     const int32_t ypixels;
 
     int16_t *host_in = nullptr;
-    uint8_t *gpu_mask = nullptr;
     int16_t *gpu_in = nullptr;
-    uint8_t *gpu_out = nullptr;
-    uint32_t *gpu_out_reduced = nullptr;
-    uint32_t *gpu_out_reduced_counter = nullptr;
-    uint32_t *host_out_reduced = nullptr;
-    uint32_t *host_out_reduced_counter = nullptr;
+    uint32_t *gpu_out = nullptr;
+    uint32_t *host_out = nullptr;
 
     int numberOfSMs;
     const int numberOfCudaThreads = 128;    // #threads per block that works well for Nvidia T4
     const int numberOfWaves = 40;           // #waves that works well for Nvidia T4
     const int windowSizeLimit = 21;         // limit on the window size (2nby+1, 2nbx+1) to prevent shared memory problems
-    const int maxStrongPixel = 65536;
 public:
 
-    GPUImageAnalysis(int32_t xpixels, int32_t ypixels, const std::vector<uint8_t> &mask);
+    GPUImageAnalysis(int32_t xpixels, int32_t ypixels);
     ~GPUImageAnalysis();
 
     void SetInputBuffer(void *host_in);
-    void LoadDataToGPU(bool apply_pixel_mask_on_gpu = true);
+    void LoadDataToGPU();
 
     void RunSpotFinder(const JFJochProtoBuf::DataProcessingSettings &settings);
     void GetSpotFinderResults(StrongPixelSet &pixel_set);
diff --git a/receiver/JFJochReceiver.cpp b/receiver/JFJochReceiver.cpp
index 460429b2..14d7278f 100644
--- a/receiver/JFJochReceiver.cpp
+++ b/receiver/JFJochReceiver.cpp
@@ -406,9 +406,7 @@ void JFJochReceiver::FrameTransformationThread() {
     std::unique_ptr<GPUImageAnalysis> spot_finder;
 
     try {
-        spot_finder = std::make_unique<GPUImageAnalysis>(experiment.GetXPixelsNum(),
-                                                         experiment.GetYPixelsNum(),
-                                                         one_byte_mask);
+        spot_finder = std::make_unique<GPUImageAnalysis>(experiment.GetXPixelsNum(), experiment.GetYPixelsNum());
         spot_finder->SetInputBuffer(transformation.GetPreview16BitImage());
         spot_finder->RegisterBuffer();
     } catch (const JFJochException& e) {
@@ -519,7 +517,7 @@ void JFJochReceiver::FrameTransformationThread() {
                 auto local_data_processing_settings = GetDataProcessingSettings();
 
                 if (calculate_spots) {
-                    spot_finder->LoadDataToGPU(!experiment.GetApplyPixelMaskInFPGA());
+                    spot_finder->LoadDataToGPU();
                     spot_finder->RunSpotFinder(local_data_processing_settings);
                     spot_finder->GetSpotFinderResults(experiment, GetDataProcessingSettings(), spots);
                     for (const auto &spot: spots)
diff --git a/tests/SpotFinderIntegration.cpp b/tests/SpotFinderIntegration.cpp
index 354dfe73..8eb52d4b 100644
--- a/tests/SpotFinderIntegration.cpp
+++ b/tests/SpotFinderIntegration.cpp
@@ -73,10 +73,7 @@ TEST_CASE("SpotFinder_GPU", "[GPUImageAnalysis]") {
     image[60000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
     std::vector<double> threshold_vals = {3,4,6,10};
 
-    std::vector<uint8_t> one_byte_mask(experiment.GetPixelsNum(), 1);
-
-    GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(),
-                                 one_byte_mask);
+    GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum());
 
     spot_finder.SetInputBuffer(image.data());
 
@@ -112,63 +109,3 @@ TEST_CASE("SpotFinder_GPU", "[GPUImageAnalysis]") {
         std::cout << "Common: " << naive_cpu.Common(colspot_gpu) << " fast impl: " << colspot_gpu.Count() << " ref. impl: " << naive_cpu.Count() << std::endl;
     }
 }
-
-TEST_CASE("SpotFinder_GPU_Mask", "[GPUImageAnalysis]") {
-    DiffractionExperiment experiment(DetectorGeometry(1));
-    experiment.Mode(DetectorMode::Raw);
-
-    const uint16_t nbx = 5;
-
-    const double mean = 30;
-
-    std::vector<int16_t> image(512*1024);
-
-    // Predictable random number generator, so test always gives the same result
-    std::mt19937 g1(2023);
-    // Poissonian distribution, with mean == variance
-    std::normal_distribution<double> distribution(mean, sqrt(mean));
-
-    for (auto &i: image)
-        i = static_cast<int16_t>(distribution(g1));
-
-    image[2000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
-    image[8000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
-    image[14000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
-    image[20000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
-    image[25000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
-    image[60000] = static_cast<int16_t>(mean + 15 * sqrt(mean));
-
-    std::vector<uint8_t> one_byte_mask(experiment.GetPixelsNum(), 1);
-
-    JFJochProtoBuf::DataProcessingSettings settings;
-    settings.set_local_bkg_size(nbx);
-    settings.set_photon_count_threshold(1);
-    settings.set_signal_to_noise_threshold(10);
-
-    {
-        GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), one_byte_mask);
-        spot_finder.SetInputBuffer(image.data());
-
-        // Without mask
-        StrongPixelSet colspot_gpu(experiment);
-        spot_finder.LoadDataToGPU();
-        spot_finder.RunSpotFinder(settings);
-        spot_finder.GetSpotFinderResults(colspot_gpu);
-
-        REQUIRE(colspot_gpu.Count() == 6);
-    }
-    {
-        one_byte_mask[2000] = 0;
-        one_byte_mask[25000] = 0;
-        one_byte_mask[60000] = 0;
-        GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), one_byte_mask);
-
-        StrongPixelSet colspot_gpu(experiment);
-        spot_finder.SetInputBuffer(image.data());
-        spot_finder.LoadDataToGPU();
-        spot_finder.RunSpotFinder(settings);
-        spot_finder.GetSpotFinderResults(colspot_gpu);
-
-        REQUIRE(colspot_gpu.Count() == 3);
-    }
-}
diff --git a/tools/DataAnalysisPerfTest.cpp b/tools/DataAnalysisPerfTest.cpp
index 6c1ea348..8dc39689 100644
--- a/tools/DataAnalysisPerfTest.cpp
+++ b/tools/DataAnalysisPerfTest.cpp
@@ -230,19 +230,19 @@ int main(int argc, char **argv) {
 
     logger.Info("COLSPOT NBX=NBY=5 (GPU)");
     if (GPUImageAnalysis::GPUPresent()) {
-        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask);
+        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum());
         TestSpotFinder(x, settings, local_peakfinder_gpu,image_conv.data(), nimages);
     }
     logger.Info("COLSPOT NBX=NBY=5 (GPU/no copy to device)");
     if (GPUImageAnalysis::GPUPresent()) {
-        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask);
+        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum());
         TestSpotFinderWithoutCopyToDevice(x, settings, local_peakfinder_gpu,image_conv.data(), nimages);
     }
 
     settings.set_local_bkg_size(3);
     logger.Info("COLSPOT NBX=NBY=3 (GPU)");
     if (GPUImageAnalysis::GPUPresent()) {
-        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask);
+        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum());
         TestSpotFinder(x, settings, local_peakfinder_gpu,image_conv.data(), nimages);
     }
 
@@ -250,7 +250,7 @@ int main(int argc, char **argv) {
 
     logger.Info("Full package");
     if (GPUImageAnalysis::GPUPresent()) {
-        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask);
+        GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum());
         TestAll(x, settings, local_peakfinder_gpu,image_conv.data(), nimages);
         TestAllWithROI(x, settings, local_peakfinder_gpu,image_conv.data(), nimages);
     }