diff --git a/common/Definitions.h b/common/Definitions.h index 6412e6e0..3f76fdfe 100644 --- a/common/Definitions.h +++ b/common/Definitions.h @@ -49,7 +49,7 @@ // For FPGA #define ACTION_TYPE 0x52324158 -#define RELEASE_LEVEL 0x0041 +#define RELEASE_LEVEL 0x0042 #define MODE_CONV 0x0001L #define MODE_BITSHUFFLE_FPGA 0x0002L diff --git a/fpga/hdl/action_config.v b/fpga/hdl/action_config.v index bd773c7f..2533bea2 100644 --- a/fpga/hdl/action_config.v +++ b/fpga/hdl/action_config.v @@ -47,6 +47,7 @@ `define ADDR_ONE_OVER_ENERGY 16'h0094 `define ADDR_NFRAMES 16'h0098 `define ADDR_NSTORAGE_CELLS 16'h009C +`define ADDR_NSUMMATION 16'h00A0 `define ADDR_SPOT_FINDER_CNT_THR 16'h0100 `define ADDR_SPOT_FINDER_SNR_THR 16'h0104 @@ -90,6 +91,7 @@ module action_config output reg [31:0] nframes , output reg [7:0] nmodules , output reg [3:0] nstorage_cells , + output reg [3:0] nsummation , output wire [31:0] hbm_size_bytes , output reg [15:0] spot_finder_count_threshold, output reg [7:0] spot_finder_snr_threshold, @@ -312,6 +314,9 @@ always @(posedge clk) begin `ADDR_NSTORAGE_CELLS: begin rdata <= nstorage_cells; end + `ADDR_NSUMMATION: begin + rdata <= nsummation; + end `ADDR_ACTION_TYPE: begin rdata <= `ACTION_TYPE; end @@ -519,6 +524,15 @@ always @(posedge clk) begin end end +always @(posedge clk) begin /* nsummation register: cleared on reset, writable only while data collection is idle */ + if (!resetn) + nsummation <= 0; + else if (reg_data_collection_idle) begin + if (w_hs && waddr == `ADDR_NSUMMATION) + nsummation <= (s_axi_WDATA[3:0] & wmask[3:0]) | (nsummation & ~wmask[3:0]); /* bitwise ~ keeps the bits not selected by the write mask; logical ! would reduce the 4-bit mask to a single bit */ + end +end + always @(posedge clk) begin if (!resetn) spot_finder_snr_threshold <= 0; diff --git a/fpga/hls/CMakeLists.txt b/fpga/hls/CMakeLists.txt index 5dd48ec9..7191dbd2 100644 --- a/fpga/hls/CMakeLists.txt +++ b/fpga/hls/CMakeLists.txt @@ -24,7 +24,9 @@ ADD_LIBRARY( HLSSimulation STATIC axis_helpers.cpp hls_bitshuffle.cpp add_multipixel.cpp - module_upside_down.cpp) + module_upside_down.cpp + frame_summation.cpp + frame_summation_reorder_compl.cpp) 
TARGET_INCLUDE_DIRECTORIES(HLSSimulation PUBLIC ../include) TARGET_LINK_LIBRARIES(HLSSimulation CommonFunctions) @@ -72,6 +74,8 @@ MAKE_HLS_MODULE(axis_32_to_512 axis_helpers.cpp "") MAKE_HLS_MODULE(adu_histo adu_histo.cpp "") MAKE_HLS_MODULE(add_multipixel add_multipixel.cpp add_multipixel_tb.cpp) MAKE_HLS_MODULE(module_upside_down module_upside_down.cpp module_upside_down_tb.cpp) +MAKE_HLS_MODULE(frame_summation frame_summation.cpp frame_summation_tb.cpp) +MAKE_HLS_MODULE(frame_summation_reorder_compl frame_summation_reorder_compl.cpp frame_summation_reorder_compl_tb.cpp) SET (HLS_IPS ${HLS_IPS} PARENT_SCOPE) ADD_CUSTOM_TARGET(hls DEPENDS ${HLS_IPS}) diff --git a/fpga/hls/adu_histo.cpp b/fpga/hls/adu_histo.cpp index ba944bc8..2eec2fa6 100644 --- a/fpga/hls/adu_histo.cpp +++ b/fpga/hls/adu_histo.cpp @@ -14,7 +14,7 @@ void adu_histo(STREAM_512 &data_in, #pragma HLS INTERFACE register both axis port=m_axis_completion #pragma HLS INTERFACE register both axis port=s_axis_completion - ap_uint<14> count[64][ADU_HISTO_BIN_COUNT]; // log2(512*1024/64) = 13 + ap_uint<19> count[64][ADU_HISTO_BIN_COUNT]; // log2(16*512*1024/64) = 17 #pragma HLS BIND_STORAGE variable=count type=ram_t2p impl=bram #pragma HLS ARRAY_PARTITION variable=count type=complete dim=1 @@ -29,13 +29,14 @@ void adu_histo(STREAM_512 &data_in, packet_512_t packet_in; data_in >> packet_in; + ap_uint<4> sum = ACT_REG_NSUMMATION(packet_in.data); // 0..15 data_out << packet_in; axis_completion cmpl; s_axis_completion >> cmpl; while (!cmpl.last) { m_axis_completion << cmpl; - for (int i = 0; i < RAW_MODULE_SIZE / (32 * 2); i++) { + for (int i = 0; i < (sum + 1) * RAW_MODULE_SIZE / (32 * 2); i++) { #pragma HLS PIPELINE II=2 for (int k = 0; k < 2; k++) { data_in >> packet_in; diff --git a/fpga/hls/data_collection_fsm.cpp b/fpga/hls/data_collection_fsm.cpp index c7eb92d0..e166b9b1 100644 --- a/fpga/hls/data_collection_fsm.cpp +++ b/fpga/hls/data_collection_fsm.cpp @@ -14,7 +14,8 @@ void 
data_collection_fsm(AXI_STREAM &eth_in, ap_uint<32> one_over_energy, ap_uint<32> nframes, ap_uint<8> nmodules, - ap_uint<4> nstorage_cells) { + ap_uint<4> nstorage_cells, + ap_uint<4> nsummation) { #pragma HLS INTERFACE ap_ctrl_none port=return #pragma HLS INTERFACE axis register both port=eth_in @@ -30,6 +31,7 @@ void data_collection_fsm(AXI_STREAM &eth_in, #pragma HLS INTERFACE ap_none register port=nframes #pragma HLS INTERFACE ap_none register port=nmodules #pragma HLS INTERFACE ap_none register port=nstorage_cells +#pragma HLS INTERFACE ap_none register port=nsummation #pragma HLS PIPELINE II=1 style=flp @@ -75,6 +77,7 @@ void data_collection_fsm(AXI_STREAM &eth_in, ACT_REG_NFRAMES(packet_out.data) = nframes; ACT_REG_NMODULES(packet_out.data) = nmodules; ACT_REG_NSTORAGE_CELLS(packet_out.data) = nstorage_cells + 1; + ACT_REG_NSUMMATION(packet_out.data) = nsummation; /* stored as given (0..15), unlike nstorage_cells which is stored +1 */ packet_out.user = 0; packet_out.last = 0; diff --git a/fpga/hls/frame_summation.cpp b/fpga/hls/frame_summation.cpp new file mode 100644 index 00000000..91e8187a --- /dev/null +++ b/fpga/hls/frame_summation.cpp @@ -0,0 +1,92 @@ +// Copyright (2019-2023) Paul Scherrer Institute + +#include "hls_jfjoch.h" + +void frame_summation(STREAM_512 &data_in, STREAM_512 &data_out, + hls::stream<axis_completion> &s_axis_completion, + hls::stream<axis_completion> &m_axis_completion) { /* Forwards the first (action control) word and reads NSUMMATION from it. For NSUMMATION > 0, every group of NSUMMATION+1 frames (16384 x 512-bit words each) is accumulated element-wise in memory_0 and emitted once together with one merged completion; for NSUMMATION == 0, data and completions pass through unchanged. */ +#pragma HLS INTERFACE axis register both port=data_in +#pragma HLS INTERFACE axis register both port=data_out +#pragma HLS INTERFACE axis register both port=s_axis_completion +#pragma HLS INTERFACE axis register both port=m_axis_completion + + ap_uint<512> memory_0[16384]; +#pragma HLS BIND_STORAGE variable=memory_0 type=ram_t2p impl=uram latency=3 + + packet_512_t packet_in, packet_out; + data_in >> packet_in; + ap_uint<4> sum = ACT_REG_NSUMMATION(packet_in.data); // 0..15 + data_out << packet_in; + + data_in >> packet_in; + if (sum > 0) { + axis_completion cmpl, cmpl_out; + s_axis_completion >> cmpl; + while (!cmpl.last) { + later_frames: + cmpl_out = cmpl; + // Frame 
numbers start with 1, so need to do a bit of play here + cmpl_out.frame_number = cmpl.frame_number / (sum+1); + + for (int s = 0; s <= sum; s++) { + if (s > 0) { + cmpl_out.packet_mask = (cmpl_out.packet_mask & cmpl.packet_mask); + cmpl_out.packet_count += cmpl.packet_count; + } + if (s == sum) + m_axis_completion << cmpl_out; + + for (int i = 0; i < 16384; i++) { +#pragma HLS PIPELINE II=1 + ap_int<16> val_0[32]; + ap_int<16> val_1[32]; + unpack32(packet_in.data, val_0); + if (s == 0) + unpack32(0, val_1); + else + unpack32(memory_0[i], val_1); + + for (int j = 0; j < 32; j++) { + if ((val_0[j] == INT16_MIN) || (val_1[j] == INT16_MIN)) + val_0[j] = INT16_MIN; + else if ((val_0[j] == INT16_MAX) || (val_1[j] == INT16_MAX)) + val_0[j] = INT16_MAX; + else { + ap_int<17> tmp = val_0[j] + val_1[j]; + if (tmp >= INT16_MAX) + val_0[j] = INT16_MAX; + else + val_0[j] = (tmp <= INT16_MIN) ? ap_int<16>(INT16_MIN) : ap_int<16>(tmp); /* clamp the low side too: a 17-bit sum below INT16_MIN would otherwise wrap when stored in 16 bits */ + } + } + + if (s == sum) { + packet_out.data = pack32(val_0); + packet_out.last = ((i == 16383) ? 1 : 0); + packet_out.keep = UINT64_MAX; + data_out << packet_out; + } else { + memory_0[i] = pack32(val_0); + } + data_in >> packet_in; + } + s_axis_completion >> cmpl; + } + } + m_axis_completion << cmpl; /* forward the terminating completion (last = 1), as the sum == 0 path does; cmpl_out only ever holds a copy with last = 0 */ + } else { + axis_completion cmpl; + s_axis_completion >> cmpl; + while (!cmpl.last) { + m_axis_completion << cmpl; + for (int i = 0; i < 16384; i++) { +#pragma HLS PIPELINE II=1 + data_out << packet_in; + data_in >> packet_in; + } + s_axis_completion >> cmpl; + } + m_axis_completion << cmpl; + } + data_out << packet_in; +} diff --git a/fpga/hls/frame_summation_reorder_compl.cpp b/fpga/hls/frame_summation_reorder_compl.cpp new file mode 100644 index 00000000..fdd2a91f --- /dev/null +++ b/fpga/hls/frame_summation_reorder_compl.cpp @@ -0,0 +1,77 @@ +// Copyright (2019-2023) Paul Scherrer Institute + +#include "hls_jfjoch.h" +#define MAX_FPGA_SUMMATION 16 + +void frame_summation_reorder_compl(STREAM_512 &data_in, + STREAM_512 &data_out, + hls::stream<axis_completion> &s_axis_completion, + hls::stream<axis_completion> &m_axis_completion) 
{ +#pragma HLS INTERFACE axis register both port=data_in +#pragma HLS INTERFACE axis register both port=data_out +#pragma HLS INTERFACE axis register both port=s_axis_completion +#pragma HLS INTERFACE axis register both port=m_axis_completion + + packet_512_t packet_in; + data_in >> packet_in; + ap_uint<4> sum = ACT_REG_NSUMMATION(packet_in.data); // 0..15 + data_out << packet_in; + + axis_completion completions[MAX_FPGA_SUMMATION * MAX_MODULES_FPGA]; /* per module: one buffered completion for each slot of the current summation group */ + ap_uint<MAX_FPGA_SUMMATION> completion_mask[MAX_MODULES_FPGA]; /* bit i set = slot i of the module's current group holds a buffered completion */ + ap_uint<5> completion_count[MAX_MODULES_FPGA]; + + ap_uint<64> curr_frame_number_prefix[MAX_MODULES_FPGA]; + + for (int i = 0; i < MAX_MODULES_FPGA; i++) { + completion_mask[i] = 0; + completion_count[i] = 0; + curr_frame_number_prefix[i] = 0; + } + + axis_completion c; + s_axis_completion >> c; + while (!c.last) { +#pragma HLS PIPELINE II=16 + ap_uint<64> frame_number_prefix = c.frame_number / (sum + 1); /* index of the summation group */ + ap_uint<5> frame_number_loc = c.frame_number % (sum + 1); /* slot inside that group */ + ap_uint<7> module = c.module; + if (frame_number_prefix > curr_frame_number_prefix[module]) { /* newer group: flush the buffered one first */ + for (int i = 0; i <= sum; i++) { + axis_completion cmpl = completions[module * MAX_FPGA_SUMMATION + i]; + if (completion_count[module] != sum + 1) + cmpl.ignore = 1; + if (completion_mask[module][i]) + m_axis_completion << cmpl; + } + completions[module * MAX_FPGA_SUMMATION + frame_number_loc] = c; + completion_mask[module] = 1 << frame_number_loc; + completion_count[module] = 1; + curr_frame_number_prefix[module] = frame_number_prefix; + } else if (frame_number_prefix == curr_frame_number_prefix[module]) { /* same group: buffer it */ + completions[module * MAX_FPGA_SUMMATION + frame_number_loc] = c; + completion_mask[module][frame_number_loc] = 1; + completion_count[module] += 1; + curr_frame_number_prefix[module] = frame_number_prefix; + } else { /* older group than the buffered one: pass through marked as ignored */ + c.ignore = 1; + m_axis_completion << c; + } + s_axis_completion >> c; + } + for (int module = 0; module < MAX_MODULES_FPGA; module++) { +#pragma HLS PIPELINE II=16 + for (int i = 0; i <= sum; i++) 
{ + axis_completion cmpl = completions[module * MAX_FPGA_SUMMATION + i]; + if (completion_count[module] != sum + 1) + cmpl.ignore = 1; + if (completion_mask[module][i]) + m_axis_completion << cmpl; + } + } + + m_axis_completion << c; + + data_in >> packet_in; + data_out << packet_in; +} diff --git a/fpga/hls/frame_summation_reorder_compl_tb.cpp b/fpga/hls/frame_summation_reorder_compl_tb.cpp new file mode 100644 index 00000000..06d57020 --- /dev/null +++ b/fpga/hls/frame_summation_reorder_compl_tb.cpp @@ -0,0 +1,123 @@ +// Copyright (2019-2023) Paul Scherrer Institute + +#include "hls_jfjoch.h" + +void expected(int &ret, hls::stream& compl_in, size_t frame_number, uint16_t module, + uint16_t packet_count, uint16_t ignore) { + axis_completion cmpl = compl_in.read(); + if (cmpl.frame_number != frame_number) { + ret = 1; + std::cerr << "Frame number error " << cmpl.frame_number << " " << frame_number << std::endl; + } + if (cmpl.module != module) { + ret = 1; + std::cerr << "Module number error " << cmpl.module << " " << module << std::endl; + } + + if (cmpl.packet_count != packet_count) { + ret = 1; + std::cerr << "Packet count error " << cmpl.packet_count << " " << packet_count << std::endl; + } + if (cmpl.ignore != ignore) { + ret = 1; + std::cerr << "Ignore val error " << cmpl.ignore << " " << ignore << std::endl; + } +} + +int main() { + + int ret = 0; + + STREAM_512 input; + STREAM_512 output; + hls::stream compl_in; + hls::stream compl_out; + size_t nframes = 5; + + ap_uint<512> action_control = 0; + ACT_REG_NSUMMATION(action_control) = nframes - 1; + + input << packet_512_t { .data = action_control, .user = 0 }; + input << packet_512_t { .user = 1 }; + + compl_in << axis_completion{.frame_number = 100, .packet_count = 128, .module = 4, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 100, .packet_count = 128, .module = 5, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 100, .packet_count = 128, .module = 3, .last = 
0, .ignore = 0}; + + compl_in << axis_completion{.frame_number = 101, .packet_count = 128, .module = 4, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 101, .packet_count = 128, .module = 5, .last = 0, .ignore = 0}; + + compl_in << axis_completion{.frame_number = 102, .packet_count = 128, .module = 5, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 102, .packet_count = 128, .module = 4, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 102, .packet_count = 128, .module = 3, .last = 0, .ignore = 0}; + + compl_in << axis_completion{.frame_number = 103, .packet_count = 128, .module = 5, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 103, .packet_count = 128, .module = 4, .last = 0, .ignore = 0}; + + compl_in << axis_completion{.frame_number = 104, .packet_count = 128, .module = 4, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 104, .packet_count = 128, .module = 5, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 80, .packet_count = 35, .module = 5, .last = 0, .ignore = 0}; + + compl_in << axis_completion{.frame_number = 105, .packet_count = 128, .module = 5, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 105, .packet_count = 128, .module = 4, .last = 0, .ignore = 0}; + compl_in << axis_completion{.frame_number = 105, .packet_count = 128, .module = 3, .last = 0, .ignore = 0}; + + compl_in << axis_completion{.last = 1}; + + size_t size_start = compl_in.size(); + + frame_summation_reorder_compl(input, output, compl_in, compl_out); + + if (compl_in.size() != 0) { + std::cout << "compl_in should be empty: " << compl_in.size() << std::endl; + ret = 1; + } + + if (compl_out.size() != size_start) { + std::cout << "compl_out should be size " << size_start << ": " << compl_out.size() << std::endl; + ret = 1; + } + + if (input.size() != 0) + ret = 1; + + if (output.size() != 2) + ret = 1; + + output.read(); + 
output.read(); + + expected(ret, compl_out, 80, 5, 35, 1); + + expected(ret, compl_out, 100, 5, 128, 0); + expected(ret, compl_out, 101, 5, 128, 0); + expected(ret, compl_out, 102, 5, 128, 0); + expected(ret, compl_out, 103, 5, 128, 0); + expected(ret, compl_out, 104, 5, 128, 0); + + expected(ret, compl_out, 100, 4, 128, 0); + expected(ret, compl_out, 101, 4, 128, 0); + expected(ret, compl_out, 102, 4, 128, 0); + expected(ret, compl_out, 103, 4, 128, 0); + expected(ret, compl_out, 104, 4, 128, 0); + expected(ret, compl_out, 100, 3, 128, 1); + expected(ret, compl_out, 102, 3, 128, 1); + expected(ret, compl_out, 105, 3, 128, 1); + expected(ret, compl_out, 105, 4, 128, 1); + expected(ret, compl_out, 105, 5, 128, 1); + + axis_completion cmpl = compl_out.read(); + if (!cmpl.last) { + std::cerr << "Last completion error" << std::endl; + ret = 1; + } + + if (ret != 0) { + printf("Test failed !!!\n"); + ret = 1; + } else { + printf("Test passed !\n"); + } + + return ret; +} diff --git a/fpga/hls/frame_summation_tb.cpp b/fpga/hls/frame_summation_tb.cpp new file mode 100644 index 00000000..a021d3ed --- /dev/null +++ b/fpga/hls/frame_summation_tb.cpp @@ -0,0 +1,101 @@ +// Copyright (2019-2023) Paul Scherrer Institute + +#include +#include "hls_jfjoch.h" + +int main() { + + int ret = 0; + + STREAM_512 input; + STREAM_512 output; + hls::stream compl_in; + hls::stream compl_out; + size_t nframes = 5; + std::vector input_frame(nframes * RAW_MODULE_SIZE); + std::vector output_frame_ref(RAW_MODULE_SIZE, 0); + std::vector output_frame(RAW_MODULE_SIZE, 0); + + std::mt19937 g1(1387); + std::uniform_int_distribution dist(0, 5000); + + for (int n = 0; n < nframes * RAW_MODULE_SIZE; n++) { + input_frame[n] = dist(g1); + output_frame_ref[n % RAW_MODULE_SIZE] += input_frame[n]; + } + + auto input_frame_512 = (ap_uint<512>*) input_frame.data(); + auto output_frame_512 = (ap_uint<512>*) output_frame.data(); + + ap_uint<512> action_control = 0; + ACT_REG_NSUMMATION(action_control) = nframes - 
1; + + input << packet_512_t { .data = action_control, .user = 0 }; + for (int i = 0; i < nframes * RAW_MODULE_SIZE * sizeof(uint16_t) / 64; i++) + input << packet_512_t { .data = input_frame_512[i], .user = 0 }; + + input << packet_512_t { .user = 1 }; + + ap_uint<128> packet_mask; + for (int i = 0; i < 128; i++) + packet_mask[i] = 1; + + for (int i = 0; i < nframes; i++) + compl_in << axis_completion{.packet_mask = packet_mask, .frame_number = 100 + i, .packet_count = 128, .last = 0}; + compl_in << axis_completion{.last = 1}; + + frame_summation(input, output, compl_in, compl_out); + + if (compl_in.size() != 0) { + std::cout << "compl_in should be empty: " << compl_in.size() << std::endl; + ret = 1; + } + + if (compl_out.size() != 2) { + std::cout << "compl_out should be size 2: " << compl_out.size() << std::endl; + ret = 1; + } + + if (input.size() != 0) + ret = 1; + + if (output.size() != RAW_MODULE_SIZE * sizeof(uint16_t) / 64 + 2) + ret = 1; + + output.read(); + for (int i = 0; i < RAW_MODULE_SIZE * sizeof(uint16_t) / 64 ; i++) + output_frame_512[i] = output.read().data; + output.read(); + + axis_completion cmpl; + cmpl = compl_out.read(); + if (cmpl.frame_number != 100/5) { + std::cout << "Wrong output frame number" << std::endl; + ret = 1; + } + + if (cmpl.packet_count != 128*5) { + std::cout << "Wrong output packet count" << std::endl; + ret = 1; + } + + if (cmpl.packet_mask != packet_mask) { + std::cout << "Wrong packet mask" << std::endl; + ret = 1; + } + compl_out.read(); /* second completion is drained without checking its fields */ + + if (output_frame != output_frame_ref) { + std::cout << "Input and output don't match" << std::endl; + ret = 1; + } + + if (ret != 0) { + printf("Test failed !!!\n"); + ret = 1; + } else { + printf("Test passed !\n"); + } + + return ret; +} diff --git a/fpga/hls/hls_jfjoch.h b/fpga/hls/hls_jfjoch.h index b2c3bd11..5b842d95 100644 --- a/fpga/hls/hls_jfjoch.h +++ b/fpga/hls/hls_jfjoch.h @@ -52,6 +52,7 @@ typedef hls::stream<packet_512_t> STREAM_512; #define ACT_REG_NFRAMES(x) ((x)(95 , 64)) // 32 
bit #define ACT_REG_NMODULES(x) ((x)(132, 128)) // 5 bit (0..31) #define ACT_REG_NSTORAGE_CELLS(x) ((x)(148, 144)) // 5 bit +#define ACT_REG_NSUMMATION(x) ((x)(163, 160)) // 4 bit (0..15) struct axis_datamover_ctrl { ap_uint<40+64> data; @@ -76,7 +77,7 @@ struct axis_completion { ap_uint<64> bunchid; ap_uint<32> debug; ap_uint<16> handle; - ap_uint<8> packet_count; + ap_uint<16> packet_count; ap_uint<7> module; ap_uint<1> last; ap_uint<1> ignore; @@ -261,7 +262,8 @@ void data_collection_fsm(AXI_STREAM &eth_in, ap_uint<32> one_over_energy, ap_uint<32> nframes, ap_uint<8> nmodules, - ap_uint<4> nstorage_cells); + ap_uint<4> nstorage_cells, + ap_uint<4> nsummation); void host_writer(STREAM_512 &data_in, hls::stream> &adu_histo_in, @@ -332,4 +334,11 @@ void load_calibration(ap_uint<256> *d_hbm_p0, void add_multipixel(STREAM_512 &data_in, STREAM_512 &data_out); void module_upside_down(STREAM_512 &data_in, STREAM_512 &data_out); +void frame_summation(STREAM_512 &data_in, STREAM_512 &data_out, + hls::stream<axis_completion> &s_axis_completion, + hls::stream<axis_completion> &m_axis_completion); +void frame_summation_reorder_compl(STREAM_512 &data_in, + STREAM_512 &data_out, + hls::stream<axis_completion> &s_axis_completion, + hls::stream<axis_completion> &m_axis_completion); #endif diff --git a/fpga/hls/integration.cpp b/fpga/hls/integration.cpp index accd17f1..f4fc775e 100644 --- a/fpga/hls/integration.cpp +++ b/fpga/hls/integration.cpp @@ -28,11 +28,11 @@ void integration(STREAM_512 &data_in, #pragma HLS INTERFACE m_axi port=d_hbm_p3 bundle=d_hbm_p3 depth=512 offset=off \ max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8 - ap_fixed<46,30, AP_RND_CONV> sum[64][FPGA_INTEGRATION_BIN_COUNT]; - // log2(32768*512*1024/64) = 28 + sign 1 bit + ap_fixed<50,34, AP_RND_CONV> sum[64][FPGA_INTEGRATION_BIN_COUNT]; + // log2(16*32768*512*1024/64) = 32 + sign 1 bit #pragma HLS BIND_STORAGE variable=sum type=ram_t2p impl=bram #pragma HLS ARRAY_PARTITION variable=sum type=complete dim=1 - ap_uint<14> 
count[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(512*1024/64) = 13 + ap_uint<18> count[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(16*512*1024/64) = 17 #pragma HLS BIND_STORAGE variable=count type=ram_t2p impl=bram #pragma HLS ARRAY_PARTITION variable=count type=complete dim=1 @@ -53,6 +53,7 @@ void integration(STREAM_512 &data_in, packet_512_t packet_in; data_in >> packet_in; + ap_uint<4> nsum = ACT_REG_NSUMMATION(packet_in.data); // 0..15 data_out << packet_in; ap_uint<32> offset_hbm_0 = 16 * hbm_size_bytes / 32; @@ -63,32 +64,35 @@ void integration(STREAM_512 &data_in, axis_completion cmpl; s_axis_completion >> cmpl; while (!cmpl.last) { - m_axis_completion << cmpl; - for (int i = 0; i < RAW_MODULE_SIZE / 32 / 2; i++) { + for (int s = 0; s < nsum+1; s++) { + m_axis_completion << cmpl; + for (int i = 0; i < RAW_MODULE_SIZE / 32 / 2; i++) { #pragma HLS PIPELINE II=2 - for (int k = 0; k < 2; k++) { - data_in >> packet_in; - data_out << packet_in; - bins_0 = d_hbm_p0[offset_hbm_0 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; - bins_1 = d_hbm_p1[offset_hbm_1 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; - coeff_0 = d_hbm_p2[offset_hbm_2 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; - coeff_1 = d_hbm_p3[offset_hbm_2 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; + for (int k = 0; k < 2; k++) { + data_in >> packet_in; + data_out << packet_in; + bins_0 = d_hbm_p0[offset_hbm_0 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; + bins_1 = d_hbm_p1[offset_hbm_1 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; + coeff_0 = d_hbm_p2[offset_hbm_2 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; + coeff_1 = d_hbm_p3[offset_hbm_2 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; - unpack_2xhbm_to_32x16bit(bins_0, bins_1, in_bin); - unpack_2xhbm_to_32x16bit(coeff_0, coeff_1, in_coeff); + unpack_2xhbm_to_32x16bit(bins_0, 
bins_1, in_bin); + unpack_2xhbm_to_32x16bit(coeff_0, coeff_1, in_coeff); - unpack32(packet_in.data, in_val); + unpack32(packet_in.data, in_val); - for (int j = 0; j < 32; j++) { - ap_fixed<32,16, AP_RND_CONV> tmp = in_val[j] * in_coeff[j]; - if ((in_val[j] != INT16_MAX) && (in_val[j] != INT16_MIN) && (in_bin[j] < FPGA_INTEGRATION_BIN_COUNT)) { - sum[k * 32 + j][in_bin[j]] += tmp; - count[k * 32 + j][in_bin[j]] += 1; + for (int j = 0; j < 32; j++) { + ap_fixed<32, 16, AP_RND_CONV> tmp = in_val[j] * in_coeff[j]; + if ((in_val[j] != INT16_MAX) && (in_val[j] != INT16_MIN) && + (in_bin[j] < FPGA_INTEGRATION_BIN_COUNT)) { + sum[k * 32 + j][in_bin[j]] += tmp; + count[k * 32 + j][in_bin[j]] += 1; + } } } } + s_axis_completion >> cmpl; } - for (int i = 0; i < FPGA_INTEGRATION_BIN_COUNT; i++) { #pragma HLS PIPELINE II=1 ap_axiu<128,1,1,1> res; @@ -112,7 +116,6 @@ void integration(STREAM_512 &data_in, res.last = ((i == FPGA_INTEGRATION_BIN_COUNT - 1) ? 1 : 0); result_out << res; } - s_axis_completion >> cmpl; } m_axis_completion << cmpl; diff --git a/fpga/pcie_driver/ActionConfig.h b/fpga/pcie_driver/ActionConfig.h index c676d4eb..d48d50bf 100644 --- a/fpga/pcie_driver/ActionConfig.h +++ b/fpga/pcie_driver/ActionConfig.h @@ -19,6 +19,7 @@ struct DataCollectionConfig { uint32_t one_over_energy; uint32_t nframes; uint32_t nstorage_cells; + uint32_t nsummation; }; struct DataCollectionStatus { diff --git a/receiver/FPGAAcquisitionDevice.cpp b/receiver/FPGAAcquisitionDevice.cpp index 5a708102..94451639 100644 --- a/receiver/FPGAAcquisitionDevice.cpp +++ b/receiver/FPGAAcquisitionDevice.cpp @@ -183,6 +183,7 @@ void FPGAAcquisitionDevice::FillActionRegister(const DiffractionExperiment& x, D job.one_over_energy = std::lround((1<<20)/ x.GetPhotonEnergy_keV()); job.nstorage_cells = x.GetStorageCellNumber() - 1; job.mode = data_collection_id << 16; + job.nsummation = 0; if ((x.GetDetectorMode() == DetectorMode::Conversion) && x.GetConversionOnFPGA()) job.mode |= MODE_CONV; diff --git 
a/receiver/HLSSimulatedDevice.cpp b/receiver/HLSSimulatedDevice.cpp index acfcff89..5c2956dc 100644 --- a/receiver/HLSSimulatedDevice.cpp +++ b/receiver/HLSSimulatedDevice.cpp @@ -306,7 +306,8 @@ void HLSSimulatedDevice::HLSMainThread() { cfg.one_over_energy, cfg.nframes, cfg.nmodules, - cfg.nstorage_cells); + cfg.nstorage_cells, + cfg.nsummation); run_data_collection = 0; } }); diff --git a/tests/ActionConfigTest.cpp b/tests/ActionConfigTest.cpp index 3dc1e75e..7e716580 100644 --- a/tests/ActionConfigTest.cpp +++ b/tests/ActionConfigTest.cpp @@ -47,7 +47,7 @@ TEST_CASE("ActionStatus") { TEST_CASE("ActionConfigSize") { - REQUIRE(sizeof(DataCollectionConfig) == 5 * sizeof(uint32_t)); + REQUIRE(sizeof(DataCollectionConfig) == 6 * sizeof(uint32_t)); } TEST_CASE("ActionConfig") { diff --git a/tests/FPGAIntegrationTest.cpp b/tests/FPGAIntegrationTest.cpp index b5a385ce..d935499c 100644 --- a/tests/FPGAIntegrationTest.cpp +++ b/tests/FPGAIntegrationTest.cpp @@ -734,7 +734,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 1); REQUIRE(addr1.empty()); REQUIRE(raw1.empty()); @@ -751,7 +752,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.empty()); REQUIRE(raw1.empty()); @@ -766,7 +768,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.empty()); REQUIRE(raw1.empty()); @@ -784,7 +787,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + 
act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.empty()); REQUIRE(raw1.empty()); @@ -799,7 +803,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.size() == 1); @@ -816,7 +821,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.size() == 1); @@ -835,7 +841,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.size() == 1); @@ -852,7 +859,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 0); REQUIRE(addr1.size() == 2); @@ -869,7 +877,8 @@ TEST_CASE("HLS_DataCollectionFSM","[OpenCAPI]") { act_reg.one_over_energy, act_reg.nframes, act_reg.nmodules, - act_reg.nstorage_cells); + act_reg.nstorage_cells, + act_reg.nsummation); REQUIRE(idle_data_collection == 1); REQUIRE(addr1.size() == 2);