Added fitting, fixed roi etc (#129)

Co-authored-by: Patrick <patrick.sieberer@psi.ch> Co-authored-by: JulianHeymes <julian.heymes@psi.ch>
2025-06-18 02:07:13 +02:00 · 2025-02-12 16:50:31 +01:00
parent 7d6223d52d
commit dadf5f4869
55 changed files with 2931 additions and 693 deletions
--- a/include/aare/CircularFifo.hpp
+++ b/include/aare/CircularFifo.hpp
@ -0,0 +1,97 @@
+#pragma once
+
+#include <chrono>
+#include <fmt/color.h>
+#include <fmt/format.h>
+#include <memory>
+#include <thread>
+
+#include "aare/ProducerConsumerQueue.hpp"
+
+namespace aare {
+
+template <class ItemType> class CircularFifo {
+    uint32_t fifo_size;
+    aare::ProducerConsumerQueue<ItemType> free_slots;
+    aare::ProducerConsumerQueue<ItemType> filled_slots;
+
+  public:
+    CircularFifo() : CircularFifo(100){};
+    CircularFifo(uint32_t size) : fifo_size(size), free_slots(size + 1), filled_slots(size + 1) {
+
+        // TODO! how do we deal with alignment for writing? alignas???
+        // Do we give the user a chance to provide memory locations?
+        // Templated allocator?
+        for (size_t i = 0; i < fifo_size; ++i) {
+            free_slots.write(ItemType{});
+        }
+    }
+
+    bool next() {
+        // TODO! avoid default constructing ItemType
+        ItemType it;
+        if (!filled_slots.read(it))
+            return false;
+        if (!free_slots.write(std::move(it)))
+            return false;
+        return true;
+    }
+
+    ~CircularFifo() {}
+
+    using value_type = ItemType;
+
+    auto numFilledSlots() const noexcept { return filled_slots.sizeGuess(); }
+    auto numFreeSlots() const noexcept { return free_slots.sizeGuess(); }
+    auto isFull() const noexcept { return filled_slots.isFull(); }
+
+    ItemType pop_free() {
+        ItemType v;
+        while (!free_slots.read(v))
+            ;
+        return std::move(v);
+        // return v;
+    }
+
+    bool try_pop_free(ItemType &v) { return free_slots.read(v); }
+
+    ItemType pop_value(std::chrono::nanoseconds wait, std::atomic<bool> &stopped) {
+        ItemType v;
+        while (!filled_slots.read(v) && !stopped) {
+            std::this_thread::sleep_for(wait);
+        }
+        return std::move(v);
+    }
+
+    ItemType pop_value() {
+        ItemType v;
+        while (!filled_slots.read(v))
+            ;
+        return std::move(v);
+    }
+
+    ItemType *frontPtr() { return filled_slots.frontPtr(); }
+
+    // TODO! Add function to move item from filled to free to be used
+    // with the frontPtr function
+
+    template <class... Args> void push_value(Args &&...recordArgs) {
+        while (!filled_slots.write(std::forward<Args>(recordArgs)...))
+            ;
+    }
+
+    template <class... Args> bool try_push_value(Args &&...recordArgs) {
+        return filled_slots.write(std::forward<Args>(recordArgs)...);
+    }
+
+    template <class... Args> void push_free(Args &&...recordArgs) {
+        while (!free_slots.write(std::forward<Args>(recordArgs)...))
+            ;
+    }
+
+    template <class... Args> bool try_push_free(Args &&...recordArgs) {
+        return free_slots.write(std::forward<Args>(recordArgs)...);
+    }
+};
+
+} // namespace aare
--- a/include/aare/ClusterCollector.hpp
+++ b/include/aare/ClusterCollector.hpp
@ -0,0 +1,52 @@
+#pragma once
+#include <atomic>
+#include <thread>
+
+#include "aare/ProducerConsumerQueue.hpp"
+#include "aare/ClusterVector.hpp"
+#include "aare/ClusterFinderMT.hpp"
+
+namespace aare {
+
+class ClusterCollector{
+        ProducerConsumerQueue<ClusterVector<int>>* m_source;
+        std::atomic<bool> m_stop_requested{false};
+        std::atomic<bool> m_stopped{true};
+        std::chrono::milliseconds m_default_wait{1};
+        std::thread m_thread;
+        std::vector<ClusterVector<int>> m_clusters;
+
+    void process(){
+        m_stopped = false;
+        fmt::print("ClusterCollector started\n");
+        while (!m_stop_requested || !m_source->isEmpty()) { 
+            if (ClusterVector<int> *clusters = m_source->frontPtr();
+                clusters != nullptr) {
+                m_clusters.push_back(std::move(*clusters));
+                m_source->popFront();
+            }else{
+                std::this_thread::sleep_for(m_default_wait);
+            }
+        }
+        fmt::print("ClusterCollector stopped\n");
+        m_stopped = true;
+    }
+
+    public:
+        ClusterCollector(ClusterFinderMT<uint16_t, double, int32_t>* source){
+            m_source = source->sink();
+            m_thread = std::thread(&ClusterCollector::process, this);
+        }
+        void stop(){
+            m_stop_requested = true;
+            m_thread.join();
+        }
+        std::vector<ClusterVector<int>> steal_clusters(){
+            if(!m_stopped){
+                throw std::runtime_error("ClusterCollector is still running");
+            }
+            return std::move(m_clusters);
+        }
+};
+
+} // namespace aare
--- a/include/aare/ClusterFile.hpp
+++ b/include/aare/ClusterFile.hpp
@ -33,6 +33,12 @@ typedef enum {
    pTopRight = 8
 } pixel;

+struct Eta2 {
+    double x;
+    double y;
+    corner c;
+};
+
 struct ClusterAnalysis {
    uint32_t c;
    int32_t tot;
@ -49,6 +55,19 @@ int32_t frame_number
 uint32_t number_of_clusters
 ....
 */
+
+/**
+ * @brief Class to read and write cluster files
+ * Expects data to be laid out as:
+ *
+ *
+ *       int32_t frame_number
+ *       uint32_t number_of_clusters
+ *       int16_t x, int16_t y, int32_t data[9] x number_of_clusters
+ *       int32_t frame_number
+ *       uint32_t number_of_clusters
+ *       etc.
+ */
 class ClusterFile {
    FILE *fp{};
    uint32_t m_num_left{};
@ -56,26 +75,61 @@ class ClusterFile {
    const std::string m_mode;

  public:
+    /**
+     * @brief Construct a new Cluster File object
+     * @param fname path to the file
+     * @param chunk_size number of clusters to read at a time when iterating
+     * over the file
+     * @param mode mode to open the file in. "r" for reading, "w" for writing,
+     * "a" for appending
+     * @throws std::runtime_error if the file could not be opened
+     */
    ClusterFile(const std::filesystem::path &fname, size_t chunk_size = 1000,
                const std::string &mode = "r");
+    
+    
    ~ClusterFile();
-    std::vector<Cluster3x3> read_clusters(size_t n_clusters);
-    std::vector<Cluster3x3> read_frame(int32_t &out_fnum);
-    void write_frame(int32_t frame_number,
-                     const ClusterVector<int32_t> &clusters);
-    std::vector<Cluster3x3>
-    read_cluster_with_cut(size_t n_clusters, double *noise_map, int nx, int ny);

+    /**
+     * @brief Read n_clusters clusters from the file discarding frame numbers.
+     * If EOF is reached the returned vector will have less than n_clusters
+     * clusters
+     */
+    ClusterVector<int32_t> read_clusters(size_t n_clusters);
+
+    /**
+     * @brief Read a single frame from the file and return the clusters. The
+     * cluster vector will have the frame number set.
+     * @throws std::runtime_error if the file is not opened for reading or the file pointer not
+     * at the beginning of a frame
+     */
+    ClusterVector<int32_t> read_frame();
+
+
+    void write_frame(const ClusterVector<int32_t> &clusters);
+    
+    // Need to be migrated to support NDArray and return a ClusterVector
+    // std::vector<Cluster3x3>
+    // read_cluster_with_cut(size_t n_clusters, double *noise_map, int nx, int ny);
+
+    /**
+     * @brief Return the chunk size
+     */
    size_t chunk_size() const { return m_chunk_size; }
+    
+    
+    /**
+     * @brief Close the file. If not closed the file will be closed in the destructor
+     */
    void close();
 };

 int analyze_data(int32_t *data, int32_t *t2, int32_t *t3, char *quad,
                 double *eta2x, double *eta2y, double *eta3x, double *eta3y);
-int analyze_cluster(Cluster3x3& cl, int32_t *t2, int32_t *t3, char *quad,
+int analyze_cluster(Cluster3x3 &cl, int32_t *t2, int32_t *t3, char *quad,
                    double *eta2x, double *eta2y, double *eta3x, double *eta3y);

-NDArray<double, 2> calculate_eta2( ClusterVector<int>& clusters);
-std::array<double,2> calculate_eta2( Cluster3x3& cl);
+NDArray<double, 2> calculate_eta2(ClusterVector<int> &clusters);
+Eta2 calculate_eta2(Cluster3x3 &cl);

 } // namespace aare
--- a/include/aare/ClusterFileSink.hpp
+++ b/include/aare/ClusterFileSink.hpp
@ -0,0 +1,56 @@
+#pragma once
+#include <atomic>
+#include <filesystem>
+#include <thread>
+
+#include "aare/ProducerConsumerQueue.hpp"
+#include "aare/ClusterVector.hpp"
+#include "aare/ClusterFinderMT.hpp"
+
+namespace aare{
+
+class ClusterFileSink{
+    ProducerConsumerQueue<ClusterVector<int>>* m_source;
+    std::atomic<bool> m_stop_requested{false};
+    std::atomic<bool> m_stopped{true};
+    std::chrono::milliseconds m_default_wait{1};
+    std::thread m_thread;
+    std::ofstream m_file;
+
+
+    void process(){
+        m_stopped = false;
+        fmt::print("ClusterFileSink started\n");
+        while (!m_stop_requested || !m_source->isEmpty()) { 
+            if (ClusterVector<int> *clusters = m_source->frontPtr();
+                clusters != nullptr) {
+                // Write clusters to file
+                int32_t frame_number = clusters->frame_number(); //TODO! Should we store frame number already as int?
+                uint32_t num_clusters = clusters->size();
+                m_file.write(reinterpret_cast<const char*>(&frame_number), sizeof(frame_number));
+                m_file.write(reinterpret_cast<const char*>(&num_clusters), sizeof(num_clusters));
+                m_file.write(reinterpret_cast<const char*>(clusters->data()), clusters->size() * clusters->item_size());
+                m_source->popFront();
+            }else{
+                std::this_thread::sleep_for(m_default_wait);
+            }
+        }
+        fmt::print("ClusterFileSink stopped\n");
+        m_stopped = true;
+    }
+
+    public:
+        ClusterFileSink(ClusterFinderMT<uint16_t, double, int32_t>* source, const std::filesystem::path& fname){
+            m_source = source->sink();
+            m_thread = std::thread(&ClusterFileSink::process, this);
+            m_file.open(fname, std::ios::binary);
+        }
+        void stop(){
+            m_stop_requested = true;
+            m_thread.join();
+            m_file.close();
+        }
+};
+
+
+} // namespace aare
--- a/include/aare/ClusterFinder.hpp
+++ b/include/aare/ClusterFinder.hpp
@ -10,26 +10,12 @@

 namespace aare {

-/** enum to define the event types */
-enum class eventType {
-    PEDESTAL,   /** pedestal */
-    NEIGHBOUR,  /** neighbour i.e. below threshold, but in the cluster of a
-                   photon */
-    PHOTON,     /** photon i.e. above threshold */
-    PHOTON_MAX, /** maximum of a cluster satisfying the photon conditions */
-    NEGATIVE_PEDESTAL, /** negative value, will not be accounted for as pedestal
-                          in order to avoid drift of the pedestal towards
-                          negative values */
-    UNDEFINED_EVENT = -1 /** undefined */
-};
-
 template <typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
          typename CT = int32_t>
 class ClusterFinder {
    Shape<2> m_image_size;
    const int m_cluster_sizeX;
    const int m_cluster_sizeY;
-    // const PEDESTAL_TYPE m_threshold;
    const PEDESTAL_TYPE m_nSigma;
    const PEDESTAL_TYPE c2;
    const PEDESTAL_TYPE c3;
@ -61,6 +47,7 @@ class ClusterFinder {

    NDArray<PEDESTAL_TYPE, 2> pedestal() { return m_pedestal.mean(); }
    NDArray<PEDESTAL_TYPE, 2> noise() { return m_pedestal.std(); }
+    void clear_pedestal() { m_pedestal.clear(); }

    /**
     * @brief Move the clusters from the ClusterVector in the ClusterFinder to a
@ -78,13 +65,13 @@ class ClusterFinder {
            m_clusters = ClusterVector<CT>(m_cluster_sizeX, m_cluster_sizeY);
        return tmp;
    }
-    void find_clusters(NDView<FRAME_TYPE, 2> frame) {
+    void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0) {
        // // TODO! deal with even size clusters
        // // currently 3,3 -> +/- 1
        // //  4,4 -> +/- 2
        int dy = m_cluster_sizeY / 2;
        int dx = m_cluster_sizeX / 2;
-
+        m_clusters.set_frame_number(frame_number);
        std::vector<CT> cluster_data(m_cluster_sizeX * m_cluster_sizeY);
        for (int iy = 0; iy < frame.shape(0); iy++) {
            for (int ix = 0; ix < frame.shape(1); ix++) {
@ -121,8 +108,8 @@ class ClusterFinder {
                } else if (total > c3 * m_nSigma * rms) {
                    // pass
                } else {
-                    // m_pedestal.push(iy, ix, frame(iy, ix));
-                    m_pedestal.push_fast(iy, ix, frame(iy, ix));
+                    // m_pedestal.push(iy, ix, frame(iy, ix));   // Safe option
+                    m_pedestal.push_fast(iy, ix, frame(iy, ix)); // Assume we have reached n_samples in the pedestal, slight performance improvement
                    continue; // It was a pedestal value nothing to store
                }

@ -157,114 +144,6 @@ class ClusterFinder {
            }
        }
    }
-
-    // // template <typename FRAME_TYPE, typename PEDESTAL_TYPE>
-    // std::vector<DynamicCluster>
-    // find_clusters_with_threshold(NDView<FRAME_TYPE, 2> frame,
-    //                              Pedestal<PEDESTAL_TYPE> &pedestal) {
-    //     assert(m_threshold > 0);
-    //     std::vector<DynamicCluster> clusters;
-    //     std::vector<std::vector<eventType>> eventMask;
-    //     for (int i = 0; i < frame.shape(0); i++) {
-    //         eventMask.push_back(std::vector<eventType>(frame.shape(1)));
-    //     }
-    //     double tthr, tthr1, tthr2;
-
-    //     NDArray<FRAME_TYPE, 2> rest({frame.shape(0), frame.shape(1)});
-    //     NDArray<int, 2> nph({frame.shape(0), frame.shape(1)});
-    //     // convert to n photons
-    //     // nph = (frame-pedestal.mean()+0.5*m_threshold)/m_threshold; // can
-    //     be
-    //     // optimized with expression templates?
-    //     for (int iy = 0; iy < frame.shape(0); iy++) {
-    //         for (int ix = 0; ix < frame.shape(1); ix++) {
-    //             auto val = frame(iy, ix) - pedestal.mean(iy, ix);
-    //             nph(iy, ix) = (val + 0.5 * m_threshold) / m_threshold;
-    //             nph(iy, ix) = nph(iy, ix) < 0 ? 0 : nph(iy, ix);
-    //             rest(iy, ix) = val - nph(iy, ix) * m_threshold;
-    //         }
-    //     }
-    //     // iterate over frame pixels
-    //     for (int iy = 0; iy < frame.shape(0); iy++) {
-    //         for (int ix = 0; ix < frame.shape(1); ix++) {
-    //             eventMask[iy][ix] = eventType::PEDESTAL;
-    //             // initialize max and total
-    //             FRAME_TYPE max = std::numeric_limits<FRAME_TYPE>::min();
-    //             long double total = 0;
-    //             if (rest(iy, ix) <= 0.25 * m_threshold) {
-    //                 pedestal.push(iy, ix, frame(iy, ix));
-    //                 continue;
-    //             }
-    //             eventMask[iy][ix] = eventType::NEIGHBOUR;
-    //             // iterate over cluster pixels around the current pixel
-    //             (ix,iy) for (short ir = -(m_cluster_sizeY / 2);
-    //                  ir < (m_cluster_sizeY / 2) + 1; ir++) {
-    //                 for (short ic = -(m_cluster_sizeX / 2);
-    //                      ic < (m_cluster_sizeX / 2) + 1; ic++) {
-    //                     if (ix + ic >= 0 && ix + ic < frame.shape(1) &&
-    //                         iy + ir >= 0 && iy + ir < frame.shape(0)) {
-    //                         auto val = frame(iy + ir, ix + ic) -
-    //                                    pedestal.mean(iy + ir, ix + ic);
-    //                         total += val;
-    //                         if (val > max) {
-    //                             max = val;
-    //                         }
-    //                     }
-    //                 }
-    //             }
-
-    //             auto rms = pedestal.std(iy, ix);
-    //             if (m_nSigma == 0) {
-    //                 tthr = m_threshold;
-    //                 tthr1 = m_threshold;
-    //                 tthr2 = m_threshold;
-    //             } else {
-    //                 tthr = m_nSigma * rms;
-    //                 tthr1 = m_nSigma * rms * c3;
-    //                 tthr2 = m_nSigma * rms * c2;
-
-    //                 if (m_threshold > 2 * tthr)
-    //                     tthr = m_threshold - tthr;
-    //                 if (m_threshold > 2 * tthr1)
-    //                     tthr1 = tthr - tthr1;
-    //                 if (m_threshold > 2 * tthr2)
-    //                     tthr2 = tthr - tthr2;
-    //             }
-    //             if (total > tthr1 || max > tthr) {
-    //                 eventMask[iy][ix] = eventType::PHOTON;
-    //                 nph(iy, ix) += 1;
-    //                 rest(iy, ix) -= m_threshold;
-    //             } else {
-    //                 pedestal.push(iy, ix, frame(iy, ix));
-    //                 continue;
-    //             }
-    //             if (eventMask[iy][ix] == eventType::PHOTON &&
-    //                 frame(iy, ix) - pedestal.mean(iy, ix) >= max) {
-    //                 eventMask[iy][ix] = eventType::PHOTON_MAX;
-    //                 DynamicCluster cluster(m_cluster_sizeX, m_cluster_sizeY,
-    //                                        Dtype(typeid(FRAME_TYPE)));
-    //                 cluster.x = ix;
-    //                 cluster.y = iy;
-    //                 short i = 0;
-    //                 for (short ir = -(m_cluster_sizeY / 2);
-    //                      ir < (m_cluster_sizeY / 2) + 1; ir++) {
-    //                     for (short ic = -(m_cluster_sizeX / 2);
-    //                          ic < (m_cluster_sizeX / 2) + 1; ic++) {
-    //                         if (ix + ic >= 0 && ix + ic < frame.shape(1) &&
-    //                             iy + ir >= 0 && iy + ir < frame.shape(0)) {
-    //                             auto tmp = frame(iy + ir, ix + ic) -
-    //                                        pedestal.mean(iy + ir, ix + ic);
-    //                             cluster.set<FRAME_TYPE>(i, tmp);
-    //                             i++;
-    //                         }
-    //                     }
-    //                 }
-    //                 clusters.push_back(cluster);
-    //             }
-    //         }
-    //     }
-    //     return clusters;
-    // }
 };

 } // namespace aare
--- a/include/aare/ClusterFinderMT.hpp
+++ b/include/aare/ClusterFinderMT.hpp
@ -0,0 +1,268 @@
+#pragma once
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "aare/ClusterFinder.hpp"
+#include "aare/NDArray.hpp"
+#include "aare/ProducerConsumerQueue.hpp"
+
+namespace aare {
+
+enum class FrameType {
+    DATA,
+    PEDESTAL,
+};
+
+struct FrameWrapper {
+    FrameType type;
+    uint64_t frame_number;
+    NDArray<uint16_t, 2> data;
+};
+
+/**
+ * @brief ClusterFinderMT is a multi-threaded version of ClusterFinder. It uses
+ * a producer-consumer queue to distribute the frames to the threads. The
+ * clusters are collected in a single output queue.
+ * @tparam FRAME_TYPE type of the frame data
+ * @tparam PEDESTAL_TYPE type of the pedestal data
+ * @tparam CT type of the cluster data
+ */
+template <typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
+          typename CT = int32_t>
+class ClusterFinderMT {
+    size_t m_current_thread{0};
+    size_t m_n_threads{0};
+    using Finder = ClusterFinder<FRAME_TYPE, PEDESTAL_TYPE, CT>;
+    using InputQueue = ProducerConsumerQueue<FrameWrapper>;
+    using OutputQueue = ProducerConsumerQueue<ClusterVector<int>>;
+    std::vector<std::unique_ptr<InputQueue>> m_input_queues;
+    std::vector<std::unique_ptr<OutputQueue>> m_output_queues;
+
+    OutputQueue m_sink{1000}; // All clusters go into this queue
+
+    std::vector<std::unique_ptr<Finder>> m_cluster_finders;
+    std::vector<std::thread> m_threads;
+    std::thread m_collect_thread;
+    std::chrono::milliseconds m_default_wait{1};
+
+    std::atomic<bool> m_stop_requested{false};
+    std::atomic<bool> m_processing_threads_stopped{true};
+
+    /**
+     * @brief Function called by the processing threads. It reads the frames
+     * from the input queue and processes them.
+     */
+    void process(int thread_id) {
+        auto cf = m_cluster_finders[thread_id].get();
+        auto q = m_input_queues[thread_id].get();
+        bool realloc_same_capacity = true;
+
+        while (!m_stop_requested || !q->isEmpty()) {
+            if (FrameWrapper *frame = q->frontPtr(); frame != nullptr) {
+
+                switch (frame->type) {
+                case FrameType::DATA:
+                    cf->find_clusters(frame->data.view(), frame->frame_number);
+                    m_output_queues[thread_id]->write(cf->steal_clusters(realloc_same_capacity));
+                    break;
+
+                case FrameType::PEDESTAL:
+                    m_cluster_finders[thread_id]->push_pedestal_frame(
+                        frame->data.view());
+                    break;
+                }
+
+                // frame is processed now discard it
+                m_input_queues[thread_id]->popFront();
+            } else {
+                std::this_thread::sleep_for(m_default_wait);
+            }
+        }
+    }
+
+    /**
+     * @brief Collect all the clusters from the output queues and write them to
+     * the sink
+     */
+    void collect() {
+        bool empty = true;
+        while (!m_stop_requested || !empty || !m_processing_threads_stopped) {
+            empty = true;
+            for (auto &queue : m_output_queues) {
+                if (!queue->isEmpty()) {
+
+                    while (!m_sink.write(std::move(*queue->frontPtr()))) {
+                        std::this_thread::sleep_for(m_default_wait);
+                    }
+                    queue->popFront();
+                    empty = false;
+                }
+            }
+        }
+    }
+
+  public:
+    /**
+     * @brief Construct a new ClusterFinderMT object
+     * @param image_size size of the image
+     * @param cluster_size size of the cluster
+     * @param nSigma number of sigma above the pedestal to consider a photon
+     * @param capacity initial capacity of the cluster vector. Should match
+     * expected number of clusters in a frame per frame.
+     * @param n_threads number of threads to use
+     */
+    ClusterFinderMT(Shape<2> image_size, Shape<2> cluster_size,
+                    PEDESTAL_TYPE nSigma = 5.0, size_t capacity = 2000,
+                    size_t n_threads = 3)
+        : m_n_threads(n_threads) {
+        for (size_t i = 0; i < n_threads; i++) {
+            m_cluster_finders.push_back(
+                std::make_unique<ClusterFinder<FRAME_TYPE, PEDESTAL_TYPE, CT>>(
+                    image_size, cluster_size, nSigma, capacity));
+        }
+        for (size_t i = 0; i < n_threads; i++) {
+            m_input_queues.emplace_back(std::make_unique<InputQueue>(200));
+            m_output_queues.emplace_back(std::make_unique<OutputQueue>(200));
+        }
+        //TODO! Should we start automatically?
+        start();
+    }
+
+    /**
+     * @brief Return the sink queue where all the clusters are collected
+     * @warning You need to empty this queue otherwise the cluster finder will wait forever
+     */
+    ProducerConsumerQueue<ClusterVector<int>> *sink() { return &m_sink; }
+
+    /**
+     * @brief Start all processing threads
+     */
+    void start() {
+        m_processing_threads_stopped = false;
+        m_stop_requested = false;
+
+        for (size_t i = 0; i < m_n_threads; i++) {
+            m_threads.push_back(
+                std::thread(&ClusterFinderMT::process, this, i));
+        }
+
+        m_collect_thread = std::thread(&ClusterFinderMT::collect, this);
+    }
+
+    /**
+     * @brief Stop all processing threads
+     */
+    void stop() {
+        m_stop_requested = true;
+
+        for (auto &thread : m_threads) {
+            thread.join();
+        }
+        m_threads.clear();
+
+        m_processing_threads_stopped = true;
+        m_collect_thread.join();
+    }
+
+    /**
+     * @brief Wait for all the queues to be empty. Mostly used for timing tests.
+     */
+    void sync() {
+        for (auto &q : m_input_queues) {
+            while (!q->isEmpty()) {
+                std::this_thread::sleep_for(m_default_wait);
+            }
+        }
+        for (auto &q : m_output_queues) {
+            while (!q->isEmpty()) {
+                std::this_thread::sleep_for(m_default_wait);
+            }
+        }
+        while (!m_sink.isEmpty()) {
+            std::this_thread::sleep_for(m_default_wait);
+        }
+    }
+
+    /**
+     * @brief Push a pedestal frame to all the cluster finders. The frames is
+     * expected to be dark. No photon finding is done. Just pedestal update.
+     */
+    void push_pedestal_frame(NDView<FRAME_TYPE, 2> frame) {
+        FrameWrapper fw{FrameType::PEDESTAL, 0,
+                        NDArray(frame)}; // TODO! copies the data!
+
+        for (auto &queue : m_input_queues) {
+            while (!queue->write(fw)) {
+                std::this_thread::sleep_for(m_default_wait);
+            }
+        }
+    }
+
+    /**
+     * @brief Push the frame to the queue of the next available thread. Function
+     * returns once the frame is in a queue.
+     * @note Spin locks with a default wait if the queue is full.
+     */
+    void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0) {
+        FrameWrapper fw{FrameType::DATA, frame_number,
+                        NDArray(frame)}; // TODO! copies the data!
+        while (!m_input_queues[m_current_thread % m_n_threads]->write(fw)) {
+            std::this_thread::sleep_for(m_default_wait);
+        }
+        m_current_thread++;
+    }
+
+    void clear_pedestal() {
+        if (!m_processing_threads_stopped) {
+            throw std::runtime_error("ClusterFinderMT is still running");
+        }
+        for (auto &cf : m_cluster_finders) {
+            cf->clear_pedestal();
+        }
+    }
+
+    /**
+     * @brief Return the pedestal currently used by the cluster finder
+     * @param thread_index index of the thread
+     */
+    auto pedestal(size_t thread_index = 0) {
+        if (m_cluster_finders.empty()) {
+            throw std::runtime_error("No cluster finders available");
+        }
+        if (!m_processing_threads_stopped) {
+            throw std::runtime_error("ClusterFinderMT is still running");
+        }
+        if (thread_index >= m_cluster_finders.size()) {
+            throw std::runtime_error("Thread index out of range");
+        }
+        return m_cluster_finders[thread_index]->pedestal();
+    }
+
+    /**
+     * @brief Return the noise currently used by the cluster finder
+     * @param thread_index index of the thread
+     */
+    auto noise(size_t thread_index = 0) {
+        if (m_cluster_finders.empty()) {
+            throw std::runtime_error("No cluster finders available");
+        }
+        if (!m_processing_threads_stopped) {
+            throw std::runtime_error("ClusterFinderMT is still running");
+        }
+        if (thread_index >= m_cluster_finders.size()) {
+            throw std::runtime_error("Thread index out of range");
+        }
+        return m_cluster_finders[thread_index]->noise();
+    }
+
+    // void push(FrameWrapper&& frame) {
+    //     //TODO! need to loop until we are successful
+    //     auto rc = m_input_queue.write(std::move(frame));
+    //     fmt::print("pushed frame {}\n", rc);
+    // }
+};
+
+} // namespace aare
--- a/include/aare/ClusterVector.hpp
+++ b/include/aare/ClusterVector.hpp
@ -1,4 +1,6 @@
 #pragma once
+#include <algorithm>
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <numeric>
@ -9,19 +11,24 @@
 namespace aare {

 /**
- * @brief ClusterVector is a container for clusters of various sizes. It uses a 
- * contiguous memory buffer to store the clusters. 
+ * @brief ClusterVector is a container for clusters of various sizes. It uses a
+ * contiguous memory buffer to store the clusters. It is templated on the data
+ * type and the coordinate type of the clusters.
 * @note push_back can invalidate pointers to elements in the container
+ * @warning ClusterVector is currently move only to catch unintended copies, but
+ * this might change since there are probably use cases where copying is needed.
 * @tparam T data type of the pixels in the cluster
- * @tparam CoordType data type of the x and y coordinates of the cluster (normally int16_t)
+ * @tparam CoordType data type of the x and y coordinates of the cluster
+ * (normally int16_t)
 */
-template <typename T, typename CoordType=int16_t> class ClusterVector {
+template <typename T, typename CoordType = int16_t> class ClusterVector {
    using value_type = T;
    size_t m_cluster_size_x;
    size_t m_cluster_size_y;
    std::byte *m_data{};
    size_t m_size{0};
    size_t m_capacity;
+    uint64_t m_frame_number{0}; // TODO! Check frame number size and type
    /*
    Format string used in the python bindings to create a numpy
    array from the buffer
@ -30,7 +37,7 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
    d - double
    i - int
    */
-    constexpr static char m_fmt_base[] = "=h:x:\nh:y:\n({},{}){}:data:" ;
+    constexpr static char m_fmt_base[] = "=h:x:\nh:y:\n({},{}){}:data:";

  public:
    /**
@ -38,30 +45,31 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
     * @param cluster_size_x size of the cluster in x direction
     * @param cluster_size_y size of the cluster in y direction
     * @param capacity initial capacity of the buffer in number of clusters
+     * @param frame_number frame number of the clusters. Default is 0, which is
+     * also used to indicate that the clusters come from many frames
     */
-    ClusterVector(size_t cluster_size_x, size_t cluster_size_y,
-                  size_t capacity = 1024)
+    ClusterVector(size_t cluster_size_x = 3, size_t cluster_size_y = 3,
+                  size_t capacity = 1024, uint64_t frame_number = 0)
        : m_cluster_size_x(cluster_size_x), m_cluster_size_y(cluster_size_y),
-          m_capacity(capacity) {
+          m_capacity(capacity), m_frame_number(frame_number) {
        allocate_buffer(capacity);
    }
-    ~ClusterVector() {
-        delete[] m_data;
-    }

-   
-    //Move constructor
+    ~ClusterVector() { delete[] m_data; }
+
+    // Move constructor
    ClusterVector(ClusterVector &&other) noexcept
        : m_cluster_size_x(other.m_cluster_size_x),
          m_cluster_size_y(other.m_cluster_size_y), m_data(other.m_data),
-          m_size(other.m_size), m_capacity(other.m_capacity) {
+          m_size(other.m_size), m_capacity(other.m_capacity),
+          m_frame_number(other.m_frame_number) {
        other.m_data = nullptr;
        other.m_size = 0;
        other.m_capacity = 0;
    }

-    //Move assignment operator
-    ClusterVector& operator=(ClusterVector &&other) noexcept {
+    // Move assignment operator
+    ClusterVector &operator=(ClusterVector &&other) noexcept {
        if (this != &other) {
            delete[] m_data;
            m_cluster_size_x = other.m_cluster_size_x;
@ -69,9 +77,11 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
            m_data = other.m_data;
            m_size = other.m_size;
            m_capacity = other.m_capacity;
+            m_frame_number = other.m_frame_number;
            other.m_data = nullptr;
            other.m_size = 0;
            other.m_capacity = 0;
+            other.m_frame_number = 0;
        }
        return *this;
    }
@ -79,7 +89,8 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
    /**
     * @brief Reserve space for at least capacity clusters
     * @param capacity number of clusters to reserve space for
-     * @note If capacity is less than the current capacity, the function does nothing. 
+     * @note If capacity is less than the current capacity, the function does
+     * nothing.
     */
    void reserve(size_t capacity) {
        if (capacity > m_capacity) {
@ -92,7 +103,8 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
     * @param x x-coordinate of the cluster
     * @param y y-coordinate of the cluster
     * @param data pointer to the data of the cluster
-     * @warning The data pointer must point to a buffer of size cluster_size_x * cluster_size_y * sizeof(T)
+     * @warning The data pointer must point to a buffer of size cluster_size_x *
+     * cluster_size_y * sizeof(T)
     */
    void push_back(CoordType x, CoordType y, const std::byte *data) {
        if (m_size == m_capacity) {
@ -108,7 +120,15 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
                  ptr);
        m_size++;
    }
-
+    ClusterVector &operator+=(const ClusterVector &other) {
+        if (m_size + other.m_size > m_capacity) {
+            allocate_buffer(m_capacity + other.m_size);
+        }
+        std::copy(other.m_data, other.m_data + other.m_size * item_size(),
+                  m_data + m_size * item_size());
+        m_size += other.m_size;
+        return *this;
+    }

    /**
     * @brief Sum the pixels in each cluster
@ -116,7 +136,7 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
     */
    std::vector<T> sum() {
        std::vector<T> sums(m_size);
-        const size_t stride = element_offset();
+        const size_t stride = item_size();
        const size_t n_pixels = m_cluster_size_x * m_cluster_size_y;
        std::byte *ptr = m_data + 2 * sizeof(CoordType); // skip x and y

@ -129,26 +149,73 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
        return sums;
    }

-    size_t size() const { return m_size; }
-    size_t capacity() const { return m_capacity; }
-    
    /**
-     * @brief Return the offset in bytes for a single cluster
+     * @brief Return the maximum sum of the 2x2 subclusters in each cluster
+     * @return std::vector<T> vector of sums for each cluster
+     * @throws std::runtime_error if the cluster size is not 3x3
+     * @warning Only 3x3 clusters are supported for the 2x2 sum.
     */
-    size_t element_offset() const {
-        return 2*sizeof(CoordType)  +
+    std::vector<T> sum_2x2() {
+        std::vector<T> sums(m_size);
+        const size_t stride = item_size();
+
+        if (m_cluster_size_x != 3 || m_cluster_size_y != 3) {
+            throw std::runtime_error(
+                "Only 3x3 clusters are supported for the 2x2 sum.");
+        }
+        std::byte *ptr = m_data + 2 * sizeof(CoordType); // skip x and y
+
+        for (size_t i = 0; i < m_size; i++) {
+            std::array<T, 4> total;
+            auto T_ptr = reinterpret_cast<T *>(ptr);
+            total[0] = T_ptr[0] + T_ptr[1] + T_ptr[3] + T_ptr[4];
+            total[1] = T_ptr[1] + T_ptr[2] + T_ptr[4] + T_ptr[5];
+            total[2] = T_ptr[3] + T_ptr[4] + T_ptr[6] + T_ptr[7];
+            total[3] = T_ptr[4] + T_ptr[5] + T_ptr[7] + T_ptr[8];
+
+            sums[i] = *std::max_element(total.begin(), total.end());
+            ptr += stride;
+        }
+
+        return sums;
+    }
+
+    /**
+     * @brief Return the number of clusters in the vector
+     */
+    size_t size() const { return m_size; }
+
+    /**
+     * @brief Return the capacity of the buffer in number of clusters. This is
+     * the number of clusters that can be stored in the current buffer without
+     * reallocation.
+     */
+    size_t capacity() const { return m_capacity; }
+
+    /**
+     * @brief Return the size in bytes of a single cluster
+     */
+    size_t item_size() const {
+        return 2 * sizeof(CoordType) +
               m_cluster_size_x * m_cluster_size_y * sizeof(T);
    }
+
    /**
     * @brief Return the offset in bytes for the i-th cluster
     */
-    size_t element_offset(size_t i) const { return element_offset() * i; }
+    size_t element_offset(size_t i) const { return item_size() * i; }

    /**
     * @brief Return a pointer to the i-th cluster
     */
    std::byte *element_ptr(size_t i) { return m_data + element_offset(i); }
-     const std::byte *  element_ptr(size_t i) const { return m_data + element_offset(i); }
+
+    /**
+     * @brief Return a pointer to the i-th cluster
+     */
+    const std::byte *element_ptr(size_t i) const {
+        return m_data + element_offset(i);
+    }

    size_t cluster_size_x() const { return m_cluster_size_x; }
    size_t cluster_size_y() const { return m_cluster_size_y; }
@ -156,21 +223,49 @@ template <typename T, typename CoordType=int16_t> class ClusterVector {
    std::byte *data() { return m_data; }
    std::byte const *data() const { return m_data; }

-    template<typename V>
-    V& at(size_t i) {
-        return *reinterpret_cast<V*>(element_ptr(i));
+    /**
+     * @brief Return a reference to the i-th cluster casted to type V
+     * @tparam V type of the cluster
+     */
+    template <typename V> V &at(size_t i) {
+        return *reinterpret_cast<V *>(element_ptr(i));
    }

    const std::string_view fmt_base() const {
-        //TODO! how do we match on coord_t?
+        // TODO! how do we match on coord_t?
        return m_fmt_base;
    }

+    /**
+     * @brief Return the frame number of the clusters. 0 is used to indicate
+     * that the clusters come from many frames
+     */
+    uint64_t frame_number() const { return m_frame_number; }
+
+    void set_frame_number(uint64_t frame_number) {
+        m_frame_number = frame_number;
+    }
+
+    /**
+     * @brief Resize the vector to contain new_size clusters. If new_size is
+     * greater than the current capacity, a new buffer is allocated. If the size
+     * is smaller no memory is freed, size is just updated.
+     * @param new_size new size of the vector
+     * @warning The additional clusters are not initialized
+     */
+    void resize(size_t new_size) {
+        // TODO! Should we initialize the new clusters?
+        if (new_size > m_capacity) {
+            allocate_buffer(new_size);
+        }
+        m_size = new_size;
+    }
+
  private:
    void allocate_buffer(size_t new_capacity) {
-        size_t num_bytes = element_offset() * new_capacity;
+        size_t num_bytes = item_size() * new_capacity;
        std::byte *new_data = new std::byte[num_bytes]{};
-        std::copy(m_data, m_data + element_offset() * m_size, new_data);
+        std::copy(m_data, m_data + item_size() * m_size, new_data);
        delete[] m_data;
        m_data = new_data;
        m_capacity = new_capacity;
--- a/include/aare/File.hpp
+++ b/include/aare/File.hpp
@ -36,6 +36,8 @@ class File {
    File(File &&other) noexcept;
    File& operator=(File &&other) noexcept;
    ~File() = default;
+
+    // void close();                             //!< close the file
      
    Frame read_frame();                       //!< read one frame from the file at the current position
    Frame read_frame(size_t frame_index);     //!< read one frame at the position given by frame number
--- a/include/aare/Fit.hpp
+++ b/include/aare/Fit.hpp
@ -0,0 +1,76 @@
+#pragma once
+
+#include <cmath>
+#include <fmt/core.h>
+#include <vector>
+
+#include "aare/NDArray.hpp"
+
+namespace aare {
+
+namespace func {
+double gaus(const double x, const double *par);
+NDArray<double, 1> gaus(NDView<double, 1> x, NDView<double, 1> par);
+
+double pol1(const double x, const double *par);
+NDArray<double, 1> pol1(NDView<double, 1> x, NDView<double, 1> par);
+
+} // namespace func
+
+static constexpr int DEFAULT_NUM_THREADS = 4;
+
+/**
+ * @brief Fit a 1D Gaussian to data.
+ * @param data data to fit
+ * @param x x values
+ */
+NDArray<double, 1> fit_gaus(NDView<double, 1> x, NDView<double, 1> y);
+
+
+/**
+ * @brief Fit a 1D Gaussian to each pixel. Data layout [row, col, values]
+ * @param x x values
+ * @param y y vales, layout [row, col, values]
+ * @param n_threads number of threads to use
+ */
+NDArray<double, 3> fit_gaus(NDView<double, 1> x, NDView<double, 3> y, int n_threads = DEFAULT_NUM_THREADS);
+
+
+/**
+ * @brief Fit a 1D Gaussian with error estimates
+ * @param x x values
+ * @param y y vales, layout [row, col, values]
+ * @param y_err error in y, layout [row, col, values]
+ * @param par_out output parameters
+ * @param par_err_out output error parameters
+ */
+void fit_gaus(NDView<double, 1> x, NDView<double, 1> y, NDView<double, 1> y_err,
+              NDView<double, 1> par_out, NDView<double, 1> par_err_out);
+
+/**
+ * @brief Fit a 1D Gaussian to each pixel with error estimates. Data layout [row, col, values]
+ * @param x x values
+ * @param y y vales, layout [row, col, values]
+ * @param y_err error in y, layout [row, col, values]
+ * @param par_out output parameters, layout [row, col, values]
+ * @param par_err_out output parameter errors, layout [row, col, values]
+ * @param n_threads number of threads to use
+ */
+void fit_gaus(NDView<double, 1> x, NDView<double, 3> y, NDView<double, 3> y_err,
+              NDView<double, 3> par_out, NDView<double, 3> par_err_out, int n_threads = DEFAULT_NUM_THREADS);
+
+
+NDArray<double, 1> fit_pol1(NDView<double, 1> x, NDView<double, 1> y);
+
+NDArray<double, 3> fit_pol1(NDView<double, 1> x, NDView<double, 3> y, int n_threads = DEFAULT_NUM_THREADS);
+
+void fit_pol1(NDView<double, 1> x, NDView<double, 1> y,
+                NDView<double, 1> y_err, NDView<double, 1> par_out,
+                NDView<double, 1> par_err_out);
+
+//TODO! not sure we need to offer the different version in C++
+void fit_pol1(NDView<double, 1> x, NDView<double, 3> y,
+                NDView<double, 3> y_err, NDView<double, 3> par_out,
+                NDView<double, 3> par_err_out, int n_threads = DEFAULT_NUM_THREADS);
+
+} // namespace aare
--- a/include/aare/Pedestal.hpp
+++ b/include/aare/Pedestal.hpp
@ -89,6 +89,7 @@ template <typename SUM_TYPE = double> class Pedestal {
        m_sum = 0;
        m_sum2 = 0;
        m_cur_samples = 0;
+        m_mean = 0;
    }

    
@ -97,6 +98,7 @@ template <typename SUM_TYPE = double> class Pedestal {
        m_sum(row, col) = 0;
        m_sum2(row, col) = 0;
        m_cur_samples(row, col) = 0;
+        m_mean(row, col) = 0;
    }
    

@ -119,7 +121,7 @@ template <typename SUM_TYPE = double> class Pedestal {

    /**
     * Push but don't update the cached mean. Speeds up the process
-     * when intitializing the pedestal.
+     * when initializing the pedestal.
     * 
     */
    template <typename T> void push_no_update(NDView<T, 2> frame) {
@ -165,8 +167,8 @@ template <typename SUM_TYPE = double> class Pedestal {
            m_sum2(row, col) += val * val;
            m_cur_samples(row, col)++;
        } else {
-            m_sum(row, col) += val - m_sum(row, col) / m_cur_samples(row, col);
-            m_sum2(row, col) += val * val - m_sum2(row, col) / m_cur_samples(row, col);
+            m_sum(row, col) += val - m_sum(row, col) / m_samples;
+            m_sum2(row, col) += val * val - m_sum2(row, col) / m_samples;
        }
        //Since we just did a push we know that m_cur_samples(row, col) is at least 1
        m_mean(row, col) = m_sum(row, col) / m_cur_samples(row, col);
--- a/include/aare/ProducerConsumerQueue.hpp
+++ b/include/aare/ProducerConsumerQueue.hpp
@ -0,0 +1,203 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Bo Hu (bhu@fb.com)
+// @author Jordan DeLong (delong.j@fb.com)
+
+// Changes made by PSD Detector Group:
+// Copied: Line 34 constexpr std::size_t hardware_destructive_interference_size = 128; from folly/lang/Align.h
+// Changed extension to .hpp
+// Changed namespace to aare
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+constexpr std::size_t hardware_destructive_interference_size = 128;
+namespace aare {
+
+/*
+ * ProducerConsumerQueue is a one producer and one consumer queue
+ * without locks.
+ */
+template <class T> struct ProducerConsumerQueue {
+    typedef T value_type;
+
+    ProducerConsumerQueue(const ProducerConsumerQueue &) = delete;
+    ProducerConsumerQueue &operator=(const ProducerConsumerQueue &) = delete;
+
+    
+    ProducerConsumerQueue(ProducerConsumerQueue &&other){
+        size_ = other.size_;
+        records_ = other.records_;
+        other.records_ = nullptr;
+        readIndex_ = other.readIndex_.load(std::memory_order_acquire);
+        writeIndex_ = other.writeIndex_.load(std::memory_order_acquire);
+    }
+    ProducerConsumerQueue &operator=(ProducerConsumerQueue &&other){
+        size_ = other.size_;
+        records_ = other.records_;
+        other.records_ = nullptr;
+        readIndex_ = other.readIndex_.load(std::memory_order_acquire);
+        writeIndex_ = other.writeIndex_.load(std::memory_order_acquire);
+        return *this;
+    }
+    
+    
+    ProducerConsumerQueue():ProducerConsumerQueue(2){};
+    // size must be >= 2.
+    //
+    // Also, note that the number of usable slots in the queue at any
+    // given time is actually (size-1), so if you start with an empty queue,
+    // isFull() will return true after size-1 insertions.
+    explicit ProducerConsumerQueue(uint32_t size)
+        : size_(size), records_(static_cast<T *>(std::malloc(sizeof(T) * size))), readIndex_(0), writeIndex_(0) {
+        assert(size >= 2);
+        if (!records_) {
+            throw std::bad_alloc();
+        }
+    }
+
+    ~ProducerConsumerQueue() {
+        // We need to destruct anything that may still exist in our queue.
+        // (No real synchronization needed at destructor time: only one
+        // thread can be doing this.)
+        if (!std::is_trivially_destructible<T>::value) {
+            size_t readIndex = readIndex_;
+            size_t endIndex = writeIndex_;
+            while (readIndex != endIndex) {
+                records_[readIndex].~T();
+                if (++readIndex == size_) {
+                    readIndex = 0;
+                }
+            }
+        }
+
+        std::free(records_);
+    }
+
+    template <class... Args> bool write(Args &&...recordArgs) {
+        auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
+        auto nextRecord = currentWrite + 1;
+        if (nextRecord == size_) {
+            nextRecord = 0;
+        }
+        if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+            new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
+            writeIndex_.store(nextRecord, std::memory_order_release);
+            return true;
+        }
+
+        // queue is full
+        return false;
+    }
+
+    // move (or copy) the value at the front of the queue to given variable
+    bool read(T &record) {
+        auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+        if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+            // queue is empty
+            return false;
+        }
+
+        auto nextRecord = currentRead + 1;
+        if (nextRecord == size_) {
+            nextRecord = 0;
+        }
+        record = std::move(records_[currentRead]);
+        records_[currentRead].~T();
+        readIndex_.store(nextRecord, std::memory_order_release);
+        return true;
+    }
+
+    // pointer to the value at the front of the queue (for use in-place) or
+    // nullptr if empty.
+    T *frontPtr() {
+        auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+        if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+            // queue is empty
+            return nullptr;
+        }
+        return &records_[currentRead];
+    }
+
+    // queue must not be empty
+    void popFront() {
+        auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+        assert(currentRead != writeIndex_.load(std::memory_order_acquire));
+
+        auto nextRecord = currentRead + 1;
+        if (nextRecord == size_) {
+            nextRecord = 0;
+        }
+        records_[currentRead].~T();
+        readIndex_.store(nextRecord, std::memory_order_release);
+    }
+
+    bool isEmpty() const {
+        return readIndex_.load(std::memory_order_acquire) == writeIndex_.load(std::memory_order_acquire);
+    }
+
+    bool isFull() const {
+        auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
+        if (nextRecord == size_) {
+            nextRecord = 0;
+        }
+        if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+            return false;
+        }
+        // queue is full
+        return true;
+    }
+
+    // * If called by consumer, then true size may be more (because producer may
+    //   be adding items concurrently).
+    // * If called by producer, then true size may be less (because consumer may
+    //   be removing items concurrently).
+    // * It is undefined to call this from any other thread.
+    size_t sizeGuess() const {
+        int ret = writeIndex_.load(std::memory_order_acquire) - readIndex_.load(std::memory_order_acquire);
+        if (ret < 0) {
+            ret += size_;
+        }
+        return ret;
+    }
+
+    // maximum number of items in the queue.
+    size_t capacity() const { return size_ - 1; }
+
+  private:
+    using AtomicIndex = std::atomic<unsigned int>;
+
+    char pad0_[hardware_destructive_interference_size];
+    // const uint32_t size_;
+    uint32_t size_;
+    // T *const records_;
+    T* records_;
+
+    alignas(hardware_destructive_interference_size) AtomicIndex readIndex_;
+    alignas(hardware_destructive_interference_size) AtomicIndex writeIndex_;
+
+    char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+};
+
+} // namespace aare
--- a/include/aare/RawFile.hpp
+++ b/include/aare/RawFile.hpp
@ -34,15 +34,19 @@ class RawFile : public FileInterface {
    size_t n_subfile_parts{}; // d0,d1...dn
    //TODO! move to vector of SubFile instead of pointers
    std::vector<std::vector<RawSubFile *>> subfiles; //subfiles[f0,f1...fn][d0,d1...dn]
-    std::vector<xy> positions;
-    std::vector<ModuleGeometry> m_module_pixel_0;
+    // std::vector<xy> positions;
+    
    ModuleConfig cfg{0, 0};

    RawMasterFile m_master;

    size_t m_current_frame{};
-    size_t m_rows{};
-    size_t m_cols{};
+
+    // std::vector<ModuleGeometry> m_module_pixel_0;
+    // size_t m_rows{};
+    // size_t m_cols{};
+
+    DetectorGeometry m_geometry;

  public:
    /**
@ -111,11 +115,12 @@ class RawFile : public FileInterface {
     */
    static DetectorHeader read_header(const std::filesystem::path &fname);

-    void update_geometry_with_roi();
+    // void update_geometry_with_roi();
    int find_number_of_subfiles();

    void open_subfiles();
    void find_geometry();
 };

+
 } // namespace aare
--- a/include/aare/RawMasterFile.hpp
+++ b/include/aare/RawMasterFile.hpp
@ -62,17 +62,6 @@ class ScanParameters {
 };


-struct ROI{
-  int64_t xmin{};
-  int64_t xmax{};
-  int64_t ymin{};
-  int64_t ymax{};
-
-  int64_t height() const { return ymax - ymin; }
-  int64_t width() const { return xmax - xmin; }
-};
-
-
 /**
 * @brief Class for parsing a master file either in our .json format or the old
 * .raw format
--- a/include/aare/RawSubFile.hpp
+++ b/include/aare/RawSubFile.hpp
@ -66,6 +66,9 @@ class RawSubFile {
    size_t pixels_per_frame() const { return m_rows * m_cols; }
    size_t bytes_per_pixel() const { return m_bitdepth / 8; }

+private:
+  template <typename T>
+  void read_with_map(std::byte *image_buf);

 };

--- a/include/aare/decode.hpp
+++ b/include/aare/decode.hpp
@ -0,0 +1,13 @@
+#pragma once
+
+#include <cstdint>
+#include <aare/NDView.hpp>
+namespace aare {
+
+
+uint16_t adc_sar_05_decode64to16(uint64_t input);
+uint16_t adc_sar_04_decode64to16(uint64_t input);
+void adc_sar_05_decode64to16(NDView<uint64_t, 2> input, NDView<uint16_t,2> output);
+void adc_sar_04_decode64to16(NDView<uint64_t, 2> input, NDView<uint16_t,2> output);
+
+} // namespace aare
--- a/include/aare/defs.hpp
+++ b/include/aare/defs.hpp
@ -179,13 +179,42 @@ template <typename T> struct t_xy {
 using xy = t_xy<uint32_t>;


+/**
+ * @brief Class to hold the geometry of a module. Where pixel 0 is located and the size of the module
+ */
 struct ModuleGeometry{
-    int x{};
-    int y{};
+    int origin_x{};
+    int origin_y{};
    int height{};
    int width{};
+    int row_index{};
+    int col_index{}; 
 };

+/**
+ * @brief Class to hold the geometry of a detector. Number of modules, their size and where pixel 0 
+ * for each module is located
+ */
+struct DetectorGeometry{
+    int modules_x{};
+    int modules_y{};
+    int pixels_x{};
+    int pixels_y{};
+    int module_gap_row{};
+    int module_gap_col{};
+    std::vector<ModuleGeometry> module_pixel_0;
+};
+
+struct ROI{
+    int64_t xmin{};
+    int64_t xmax{};
+    int64_t ymin{};
+    int64_t ymax{};
+  
+    int64_t height() const { return ymax - ymin; }
+    int64_t width() const { return xmax - xmin; }
+  };
+

 using dynamic_shape = std::vector<int64_t>;

--- a/include/aare/geo_helpers.hpp
+++ b/include/aare/geo_helpers.hpp
@ -0,0 +1,16 @@
+#pragma once
+#include "aare/defs.hpp"
+#include "aare/RawMasterFile.hpp" //ROI refactor away
+namespace aare{
+
+/**
+ * @brief Update the detector geometry given a region of interest
+ * 
+ * @param geo 
+ * @param roi 
+ * @return DetectorGeometry 
+ */
+DetectorGeometry update_geometry_with_roi(DetectorGeometry geo, ROI roi);
+
+
+} // namespace aare
--- a/include/aare/utils/task.hpp
+++ b/include/aare/utils/task.hpp
@ -0,0 +1,8 @@
+
+#include <utility>
+#include <vector>
+
+namespace aare {
+std::vector<std::pair<int, int>> split_task(int first, int last, int n_threads);
+
+} // namespace aare