// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: GPL-3.0-only #include #include "../acquisition_device/PCIExpressDevice.h" #include "../acquisition_device/HLSSimulatedDevice.h" #include "../receiver/JFJochReceiverTest.h" #include "../tests/CheckImageOutput.h" #include "../common/print_license.h" void print_usage(Logger &logger) { logger.Info("Usage ./jfjoch_fpga_test {} "); logger.Info("Options:"); logger.Info(" -v Verbose"); logger.Info(" -H Simulation with C HLS model (doesn't require FPGA device)"); logger.Info(" -E{} EIGER detector mode (with optional bit depth: 8, 16, 32)"); logger.Info(" -R Raw"); logger.Info(" -S Number of summed frames"); logger.Info(" -I Use 32-bit integer"); logger.Info(" -c Use 8-bit integer"); logger.Info(" -s Number of data streams (acquisition devices)"); logger.Info(" -m Number of modules"); logger.Info(" -i Number of images"); logger.Info(" -N Number of image processing threads"); logger.Info(" -P NUMA Policy (none|n2g2|n8g4|n8g4_hbm), none is default"); logger.Info(" -B Size of send buffer in MiB (default 2048)"); logger.Info(" -q Use Poisson lossy compression, with square root of counts"); logger.Info(" -T Use thresholding for low counts"); logger.Info(" -M Apply pixel mask"); logger.Info(" -0 No compression"); logger.Info(" -Z Fast Zstd (RLE only!) compression"); logger.Info(" -X Indexing (none|fft|ffbidx), ffbidx is default"); logger.Info(" -t Indexing thread pool size (default: 4)"); logger.Info(" -f FFT indexing search vectors"); logger.Info(" -Q Quick integration"); } int main(int argc, char **argv) { print_license("jfjoch_fpga_test"); Logger logger("jfjoch_fpga_test"); logger.Verbose(true); constexpr uint64_t clock_MHz = 200; uint16_t nstreams = 1; uint16_t nmodules = 1; uint16_t nsummation = 1; size_t nimages = 2; uint16_t nthreads = 64; IndexingAlgorithmEnum indexing = IndexingAlgorithmEnum::FFBIDX; uint16_t indexing_threads = 4; std::optional fft_num_vectors; bool verbose = false; std::string numa_policy_name; bool raw_data = false; bool force_32bit = false; bool force_8bit = false; bool apply_pixel_mask = false; bool quick_integrate = false; std::optional eiger_bit_depth; DetectorType detector_type = DetectorType::JUNGFRAU; bool hls_simulation = false; size_t send_buffer_size_MiB = 2048; std::optional lossy_compression_poisson; int64_t thresholding = 0; CompressionAlgorithm compr = CompressionAlgorithm::BSHUF_LZ4; if (argc == 1) { print_usage(logger); exit(EXIT_FAILURE); } int opt; while ((opt = getopt(argc, argv, "s:i:m:N:P:vRIS:E::HB:q:T:cM0ZX:t:f:Q")) != -1) { switch (opt) { case '0': compr = CompressionAlgorithm::NO_COMPRESSION; break; case 'Z': compr = CompressionAlgorithm::BSHUF_ZSTD_RLE; break; case 'i': nimages = atol(optarg); break; case 'S': nsummation = atol(optarg); break; case 'm': nmodules = atol(optarg); break; case 's': nstreams = atol(optarg); break; case 'N': nthreads = atol(optarg); break; case 'v': verbose = true; break; case 'P': numa_policy_name = std::string(optarg); break; case 'R': raw_data = true; break; case 'I': force_32bit = true; break; case 'c': force_8bit = true; break; case 'M': apply_pixel_mask = true; break; case 'E': detector_type = DetectorType::EIGER; if (optarg != nullptr) { eiger_bit_depth = atol(optarg); } break; case 'H': hls_simulation = true; break; case 'B': send_buffer_size_MiB = atol(optarg); break; case 'q': lossy_compression_poisson = atol(optarg); break; case 'T': thresholding = atol(optarg); break; case 'X': if (std::string(optarg) == "none") indexing = IndexingAlgorithmEnum::None; else if (std::string(optarg) == "fft" || std::string(optarg) == "FFT") indexing = IndexingAlgorithmEnum::FFT; else if (std::string(optarg) == "ffbidx" || std::string(optarg) == "FFBIDX") indexing = IndexingAlgorithmEnum::FFBIDX; break; case 't': indexing_threads = atol(optarg); break; case 'f': fft_num_vectors = atol(optarg); break; case 'Q': quick_integrate = true; break; default: /* '?' */ print_usage(logger); exit(EXIT_FAILURE); } } if (optind != argc - 1) { print_usage(logger); exit(EXIT_FAILURE); } DiffractionExperiment x(DetectorSetup(DetectorGeometryModular(nmodules, 2, 8, 36, true), detector_type)); if (raw_data) x.Raw(); x.ImagesPerTrigger(nimages).Summation(nsummation).PedestalG0Frames(0).UseInternalPacketGenerator(true). IncidentEnergy_keV(12.4).NumTriggers(1); x.MaskModuleEdges(false).MaskChipEdges(false).BeamX_pxl(x.GetXPixelsNum() / 2.0).BeamY_pxl(x.GetYPixelsNum() / 2.0). DetectorDistance_mm(100); x.Compression(compr).DataStreams(nstreams); x.SetUnitCell(UnitCell{.a = 79, .b = 79, .c = 37, .alpha = 90.0, .beta = 90.0, .gamma = 90.0}); x.LossyCompressionPoisson(lossy_compression_poisson); x.ApplyPixelMask(apply_pixel_mask); x.MaskChipEdges(true).MaskModuleEdges(true); PixelMask mask(x); IndexingSettings i_settings; i_settings.Algorithm(indexing); if (fft_num_vectors) i_settings.FFT_NumVectors(fft_num_vectors.value()); i_settings.IndexingThreads(indexing_threads); x.ImportIndexingSettings(i_settings); if (thresholding > 0) x.PixelValueLowThreshold(thresholding); if (force_32bit) x.BitDepthImage(32); else if (force_8bit) x.BitDepthImage(8); x.EigerBitDepth(eiger_bit_depth); logger.Info("Data streams {} Total modules {} Total images {} Threads {}", nstreams, nmodules, nimages, nthreads); std::vector dev_name = { "/dev/jfjoch0", "/dev/jfjoch1", "/dev/jfjoch2", "/dev/jfjoch3", "/dev/jfjoch4", "/dev/jfjoch5", }; logger.Verbose(verbose); AcquisitionDeviceGroup aq_devices; std::string image_path = std::string(argv[optind]) + "/tests/test_data/mod5_raw0.bin"; std::vector input(RAW_MODULE_SIZE * x.GetModulesNum(), 0); std::vector tmp(RAW_MODULE_SIZE); LoadBinaryFile(image_path, tmp.data(), RAW_MODULE_SIZE); for (int m = 0; m < x.GetModulesNum(); m++) memcpy(input.data() + RAW_MODULE_SIZE * m, tmp.data(), RAW_MODULE_SIZE * sizeof(uint16_t)); if (hls_simulation) { if (nstreams != 1) { logger.Error("HLS simulation can work with only one device"); exit(EXIT_FAILURE); } auto tmp = std::make_unique(0, 128); tmp->EnableLogging(&logger); aq_devices.Add(std::move(tmp)); } else { if (nstreams > dev_name.size()) { logger.Error("Only {} data streams allowed on this platform", dev_name.size()); exit(EXIT_FAILURE); } for (int i = 0; i < nstreams; i++) { auto tmp = std::make_unique(i, dev_name[i]); tmp->EnableLogging(&logger); tmp->SetIPv4Address((i << 24) + 0x010a0a0a); aq_devices.Add(std::move(tmp)); } } std::atomic done = false; JFJochReceiverOutput output; bool ret; std::thread run_thread([&] { try { ret = JFJochReceiverTest(output, logger, aq_devices, x, mask, input, nthreads, numa_policy_name, send_buffer_size_MiB, quick_integrate); } catch (std::exception &e) { logger.Error(e.what()); ret = false; } done = true; }); while (!done) { for (int i = 0; i < nstreams; i++) { auto coll_status = aq_devices[i].GetDataCollectionStatus(); auto dev_status = aq_devices[i].GetDeviceStatus(); double power_3p3v = (dev_status.fpga_pcie_3p3V_I_mA * dev_status.fpga_pcie_3p3V_V_mV) / (1000.0 * 1000.0); double power_12v = (dev_status.fpga_pcie_12V_I_mA * dev_status.fpga_pcie_12V_V_mV) / (1000.0 * 1000.0); logger.Info( "#{}: Slowest packet: {:8d} Pwr: {:4.1f}+{:4.1f}={:5.1f} W T FPGA/HBM1/HBM2: {:3d}/{:3d}/{:3d} degC Stalls: {:15d}/{:15d}", i, aq_devices[i].Counters().GetSlowestFrameNumber(), power_12v, power_3p3v, power_12v + power_3p3v, dev_status.fpga_temp_C, dev_status.hbm_0_temp_C, dev_status.hbm_1_temp_C, coll_status.pipeline_stalls_hbm, coll_status.pipeline_stalls_host); } std::this_thread::sleep_for(std::chrono::seconds(1)); } run_thread.join(); double receiving_time = static_cast(output.end_time_ms - output.start_time_ms) / 1000.0; logger.Info("Efficiency: {:.2f}%", output.efficiency * 100.f); if (output.status.max_receive_delay) logger.Info("Max delay: {}", output.status.max_receive_delay.value()); if (output.status.compressed_ratio) logger.Info("Compression factor: {}x", output.status.compressed_ratio.value()); logger.Info("Receiving time: {} s", receiving_time); logger.Info("Frame rate: {} Hz", static_cast(nimages) / receiving_time); logger.Info("Total throughput: {:.2f} GB/s", static_cast(nsummation * nimages * x.GetModulesNum() * RAW_MODULE_SIZE * x.GetBitDepthReadout() / 8) / (receiving_time * 1e9)); logger.Info(""); for (int i = 0; i < nstreams; i++) { auto coll_status = aq_devices[i].GetDataCollectionStatus(); auto stalls_hbm = coll_status.pipeline_stalls_hbm; auto stalls_host = coll_status.pipeline_stalls_host; uint64_t throughput_MBs = nimages * nsummation * x.GetModulesNum(i) * RAW_MODULE_SIZE * sizeof(uint16_t) * clock_MHz / (nimages * nsummation * x.GetModulesNum(i) * 128 * 128 + stalls_hbm); double performance = static_cast(throughput_MBs) / 1000; logger.Info("Device {}: stalls HBM: {} stalls host: {} est. performance: {:.2f} GB/s", i, stalls_hbm, stalls_host, performance); } if (ret) { logger.Info(""); logger.Info("Test properly executed! (check stall values manually)"); exit(EXIT_SUCCESS); } else { logger.Info("Test finished with errors! (check stall values manually)"); exit(EXIT_FAILURE); } }