// Jungfraujoch/receiver/jfjoch_action_test.cpp

// Copyright (2019-2023) Paul Scherrer Institute

#include <unistd.h>

#include <atomic>
#include <charconv>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <system_error>
#include <thread>
#include <vector>

#include "../acquisition_device/PCIExpressDevice.h"
#include "../acquisition_device/HLSSimulatedDevice.h"
#include "JFJochReceiverTest.h"
#include "../tests/FPGAUnitTest.h"

/**
 * Prints the command line synopsis of the tool through the given logger.
 *
 * Every option accepted by the getopt() string in main() is listed here,
 * including -E and -H, so the help text matches what the parser accepts.
 *
 * @param logger  logger that receives the usage text (one Info message per line)
 */
void print_usage(Logger &logger) {
    logger.Info("Usage ./jfjoch_action_test {<options>} <path to repository>");
    logger.Info("Options:");
    logger.Info("   -R       raw");
    logger.Info("   -v       verbose");
    logger.Info("   -E       use EIGER detector (default is JUNGFRAU)");
    logger.Info("   -H       use HLS simulated device instead of PCIe devices (single data stream only)");
    logger.Info("   -S<num>  number of summed frames");
    logger.Info("   -I       use 32-bit integer");
    logger.Info("   -s<num>  number of data streams (acquisition devices)");
    logger.Info("   -m<num>  number of modules");
    logger.Info("   -i<num>  number of images");
    logger.Info("   -N<num>  number of image processing threads");
    logger.Info("   -P<txt>  NUMA Policy (none|n2g2|n8g4|n8g4_hbm), none is default");
    logger.Info("   -D<path> use resonet deep learning model for resolution estimation - path to TorchScript");
    logger.Info("   -B<num>  size of send buffer in MiB (default 2048)");
}

int main(int argc, char **argv) {
    Logger logger("ActionTest");
    logger.Verbose(true);

    constexpr uint64_t clock_MHz = 200;
    uint16_t nstreams = 1;
    uint16_t nmodules = 1;
    uint16_t nsummation = 1;
    size_t nimages = 2;
    uint16_t nthreads = 64;
    bool verbose = false;
    std::string numa_policy_name;
    bool raw_data = false;
    bool force_32bit = false;
    std::string resonet_path;
    DetectorType detector_type = DetectorType::JUNGFRAU;
    bool hls_simulation = false;
    size_t send_buffer_size_MiB = 2048;

    if (argc == 1) {
        print_usage(logger);
        exit(EXIT_FAILURE);
    }

    int opt;
    while ((opt = getopt(argc, argv, "s:i:m:N:P:vRIS:D:EHB:")) != -1) {
        switch (opt) {
            case 'i':
                nimages = atol(optarg);
                break;
            case 'S':
                nsummation = atol(optarg);
                break;
            case 'm':
                nmodules = atol(optarg);
                break;
            case 's':
                nstreams = atol(optarg);
                break;
            case 'N':
                nthreads = atol(optarg);
                break;
            case 'v':
                verbose = true;
                break;
            case 'P':
                numa_policy_name = std::string(optarg);
                break;
            case 'R':
                raw_data = true;
                break;
            case 'I':
                force_32bit = true;
                break;
            case 'D':
                resonet_path = std::string(optarg);
                break;
            case 'E':
                detector_type = DetectorType::EIGER;
                break;
            case 'H':
                hls_simulation = true;
                break;
            case 'B':
                send_buffer_size_MiB = atol(optarg);
                break;
            default: /* '?' */
                print_usage(logger);
                exit(EXIT_FAILURE);
        }
    }

    if (optind != argc - 1) {
        print_usage(logger);
        exit(EXIT_FAILURE);
    }

    DiffractionExperiment x(DetectorSetup(DetectorGeometry(nmodules, 2, 8, 36, true), detector_type));

    if (raw_data)
        x.Mode(DetectorMode::Raw);
    else
        x.Mode(DetectorMode::Conversion);

    x.ImagesPerTrigger(nimages).Summation(nsummation).PedestalG0Frames(0).UseInternalPacketGenerator(true).PhotonEnergy_keV(12.4).NumTriggers(1);
    x.MaskModuleEdges(false).MaskChipEdges(false).BeamX_pxl(x.GetXPixelsNum()/ 2.0).BeamY_pxl(x.GetYPixelsNum()/ 2.0).DetectorDistance_mm(100);
    x.Compression(CompressionAlgorithm::BSHUF_LZ4).DataStreams(nstreams);
    x.SetUnitCell(UnitCell{.a = 79, .b = 79, .c = 37, .alpha = 90.0, .beta = 90.0, .gamma = 90.0});

    if (force_32bit)
        x.FPGAOutputMode(FPGAPixelOutput::Int32);
    if (!resonet_path.empty())
        x.NeuralNetModelPath(resonet_path);

    logger.Info("Data streams {} Total modules {} Total images {} Threads {}", nstreams, nmodules, nimages, nthreads);

    std::vector<std::string> dev_name = {
            "/dev/jfjoch0",
            "/dev/jfjoch2",
            "/dev/jfjoch1",
            "/dev/jfjoch3"
    };

    logger.Verbose(verbose);

    AcquisitionDeviceGroup aq_devices;

    std::string image_path = std::string(argv[optind]) + "/tests/test_data/mod5_raw0.bin";
    std::vector<uint16_t> input(RAW_MODULE_SIZE * x.GetModulesNum(), 0);
    std::vector<uint16_t> tmp(RAW_MODULE_SIZE);
    LoadBinaryFile(image_path, tmp.data(), RAW_MODULE_SIZE);
    for (int m = 0; m < x.GetModulesNum(); m++)
        memcpy(input.data() + RAW_MODULE_SIZE * m, tmp.data(), RAW_MODULE_SIZE * sizeof(uint16_t));

    if (hls_simulation) {
        if (nstreams != 1) {
            logger.Error("HLS simulation can work with only one device");
            exit(EXIT_FAILURE);
        }
        auto tmp = std::make_unique<HLSSimulatedDevice>(0, 128);
        tmp->EnableLogging(&logger);
        tmp->SetIPv4Address(0x010a0a0a);
        aq_devices.Add(std::move(tmp));
    } else {
        if (nstreams > dev_name.size()) {
            logger.Error("Only {} data streams allowed on this platform", dev_name.size());
            exit(EXIT_FAILURE);
        }

        for (int i = 0; i < nstreams; i++) {
            auto tmp = std::make_unique<PCIExpressDevice>(i, dev_name[i]);
            tmp->EnableLogging(&logger);
            tmp->SetDefaultMAC();
            tmp->SetIPv4Address((i << 24) + 0x010a0a0a);
            aq_devices.Add(std::move(tmp));
        }
    }

    volatile bool done = false;
    JFJochReceiverOutput output;
    bool ret;
    std::thread run_thread([&] {
        try {
            ret = JFJochReceiverTest(output, logger, aq_devices, x, input, nthreads, numa_policy_name, send_buffer_size_MiB);
        } catch (std::exception &e) {
            logger.Error(e.what());
            ret = false;
        }
        done = true;
    });

    while (!done) {
        for (int i = 0; i < nstreams; i++) {
            auto coll_status = aq_devices[i].GetDataCollectionStatus();
            auto dev_status = aq_devices[i].GetDeviceStatus();
            double power_3p3v = (dev_status.fpga_pcie_3p3V_I_mA * dev_status.fpga_pcie_3p3V_V_mV) / (1000.0 * 1000.0);
            double power_12v = (dev_status.fpga_pcie_12V_I_mA * dev_status.fpga_pcie_12V_V_mV) / (1000.0 * 1000.0);
            logger.Info("Device {}:  Slowest packet: {:8d}  Power: {:5.1f}+{:5.1f} W  FPGA Temp: {:d} degC  HBM Temp: {:d}/{:d} degC  Stalls: {:15d}/{:15d}/{:15d}",
                        i, aq_devices[i].Counters().GetSlowestFrameNumber(), power_12v, power_3p3v,
                        dev_status.fpga_temp_C, dev_status.hbm_0_temp_C, dev_status.hbm_1_temp_C,
                        coll_status.pipeline_stalls_hbm, coll_status.pipeline_stalls_proc, coll_status.pipeline_stalls_host);
        }
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }

    run_thread.join();

    double receiving_time = static_cast<double>(output.end_time_ms - output.start_time_ms)/1000.0;

    logger.Info("Efficiency: {:.2f}%", output.efficiency * 100.f);
    logger.Info("Max delay: {}",output.status.max_receive_delay);
    logger.Info("Compression factor: {}x", output.status.compressed_ratio);
    logger.Info("Receiving time: {} s", receiving_time);
    logger.Info("Frame rate: {} Hz", static_cast<double>(nimages)/receiving_time);
    logger.Info("Total throughput: {:.2f} GB/s",
            static_cast<double>(nsummation * nimages*x.GetModulesNum()*RAW_MODULE_SIZE*sizeof(uint16_t)) / (receiving_time * 1e9));

    logger.Info("");
    for (int i = 0; i < nstreams; i++) {
        auto coll_status = aq_devices[i].GetDataCollectionStatus();
        auto stalls_hbm = coll_status.pipeline_stalls_hbm;
        auto stalls_host = coll_status.pipeline_stalls_host;

        uint64_t throughput_MBs = nimages * nsummation * x.GetModulesNum(i) * RAW_MODULE_SIZE * sizeof(uint16_t) * clock_MHz /
                                  (nimages * nsummation * x.GetModulesNum(i) * 128 * 128 + stalls_hbm);
        double performance = static_cast<double>(throughput_MBs) / 1000;

        logger.Info("Device {}:  stalls HBM: {}  stalls host: {}   est. performance: {:.2f} GB/s", i, stalls_hbm,
                    stalls_host, performance);
    }

    if (ret) {
        logger.Info("");
        logger.Info("Test properly executed! (check stall values manually)");
        exit(EXIT_SUCCESS);
    } else {
        logger.Info("Test finished with errors! (check stall values manually)");
        exit(EXIT_FAILURE);
    }
}