// Copyright (2019-2023) Paul Scherrer Institute

#include "GPUImageAnalysis.h"
#include "../common/JFJochException.h"
#include "../common/DiffractionGeometry.h"

#include <sstream>
#include "../common/CUDAWrapper.h"

// input X x Y pixels array
// output X x Y byte array

struct CudaStreamWrapper {
    cudaStream_t v;
};

inline void cuda_err(cudaError_t val) {
    if (val != cudaSuccess)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}

// Determine if a pixel could be a spot
// params: spot finding parameters
// val: pixel value
// sum: window sum
// sum2: window sum of squares
// count: window count of valid pixels
// return the pixel result: 0 - no spot / 1 - spot candidate
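//
// The test below is algebraically equivalent to requiring
//   (val - mean)^2 > strong_pixel_threshold2 * sample_variance   and   val > mean,
// where mean and sample variance are taken over the window excluding the pixel itself;
// divisions are avoided by cross-multiplying with count and (count - 1).
// In addition, val must reach params.count_threshold.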
__device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, const int64_t val, int64_t sum, int64_t sum2, int64_t count)
{
    sum -= val;
    sum2 -= val * val;
    count -= 1;

    const int64_t var = count * sum2 - (sum * sum); // This should be divided by ((2*NBX+1) * (2*NBY+1)-1)*((2*NBX+1) * (2*NBY+1))
    const int64_t in_minus_mean = val * count - sum; // Should be divided by ((2*NBX+1) * (2*NBY+1));

    const int64_t tmp1 = in_minus_mean * in_minus_mean * (count-1);
    const float tmp2 = (var * count) * params.strong_pixel_threshold2;
    const bool strong_pixel = (val >= params.count_threshold) & (in_minus_mean > 0) & (tmp1 > tmp2);

    return strong_pixel ? 1 : 0;
}

// Find pixels that could be spots
// in: image input values
// out: pixel result byte array, 1 byte per pixel (0: no / 1: candidate spot)
// params: spot finding parameters
//
// The algorithm uses multiple waves (blockDim.y) that run over sections of rows.
// Each wave writes output at its back row and reads input at its front row.
// Each wave is split into column output sections (blockDim.x).
// A wave section (block) is responsible for a particular row/column section and
// maintains sum/sum2/count values per column for the output row.
// Every CUDA thread is associated with a particular column. The thread maintains
// the sum/sum2/count values in shared memory for its column. To do this, the input
// pixel values for the height of the aggregation window are kept in shared memory.
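//
// Shared memory layout per block (see sharedSize in RunSpotFinder):
//   int32_t sum[blockDim.x]            running column sums
//   int32_t sum2[blockDim.x]           running column sums of squares
//   int16_t count[blockDim.x]          running column valid-pixel counts
//   int16_t val[window][blockDim.x]    cyclic buffer of the last `window` input rows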
__global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parameters params)
{
    // assumption: 2 * params.nby + 1 <= params.rows and 2 * params.nbx + 1 <= params.cols
    const int32_t window = 2 * (int)params.nby + 1; // vertical window

    const int32_t writeSize = blockDim.x - 2 * params.nbx; // output columns per block
    const int32_t cmin = blockIdx.x * writeSize; // lowest output column
    const int32_t cmax = min(cmin + writeSize, static_cast<int32_t>(params.cols)); // past highest output column
    const int32_t col = cmin + threadIdx.x - params.nbx; // thread -> column mapping
    const bool data_col = (col >= 0) & (col < static_cast<int32_t>(params.cols)); // read global mem
    const bool result_col = (col >= cmin) & (col < cmax); // write result
    const int32_t nWaves = gridDim.y; // number of waves
    const int32_t rowsPerWave = (params.lines + nWaves - 1) / nWaves; // rows per wave
    const int32_t rmin = blockIdx.y * rowsPerWave; // lowest result row for this wave
    const int32_t rmax = min(rmin + rowsPerWave, static_cast<int32_t>(params.lines)); // past highest result row for this wave

    const int32_t left = max(static_cast<int32_t>(threadIdx.x) - static_cast<int32_t>(params.nbx), 0); // leftmost column touched by this thread
    const int32_t right = min(static_cast<int32_t>(threadIdx.x) + static_cast<int32_t>(params.nbx) + 1, static_cast<int32_t>(params.cols)); // past rightmost column touched by this thread

    int32_t back = rmin; // back of wave for writing
    int32_t front = max(back - static_cast<int32_t>(params.nby), 0); // front of wave for reading (needs to overtake back initially)

    extern __shared__ int32_t shared_mem[];
    int32_t* shared_sum = shared_mem; // shared buffer [blockDim.x]
    int32_t* shared_sum2 = &shared_sum[blockDim.x]; // shared buffer [blockDim.x]
    int16_t* shared_count = reinterpret_cast<int16_t*>(&shared_sum2[blockDim.x]); // shared buffer [blockDim.x]
    int16_t* shared_val = &shared_count[blockDim.x]; // shared cyclic buffer [window, blockDim.x]

    int32_t total_sum; // totals
    int32_t total_sum2;
    int32_t total_count;

    // initialize sum, sum2, count, val buffers
    const int16_t ini = params.min_viable_number - 1; // value that is NOT counted
    shared_sum[threadIdx.x] = 0; // shared values without effect on totals
    shared_sum2[threadIdx.x] = 0;
    shared_count[threadIdx.x] = 0;
    for (int i=0; i<window; i++)
        shared_val[i * blockDim.x + threadIdx.x] = ini;

    // wave front up to rmin + nby + 1
    do {
        if (data_col) { // read at the front end of the wave
            const int16_t val = in[front * params.cols + col];
            shared_val[(front % window) * blockDim.x + threadIdx.x] = val;
            if (val >= params.min_viable_number) {
                shared_sum[threadIdx.x] += val;
                shared_sum2[threadIdx.x] += val * val;
                shared_count[threadIdx.x] += 1;
            }
        }
        front++;
    } while (front < rmin + static_cast<int32_t>(params.nby) + 1);
    // wave front up to rmax
    do {
        __syncthreads(); // make others see the shared values
        if (result_col) { // write at the back end of the wave
            total_sum = total_sum2 = total_count = 0;
            for (auto j = left; j < right; j++) {
                total_sum += shared_sum[j];
                total_sum2 += shared_sum2[j];
                total_count += shared_count[j];
            }
            out[back * params.cols + col] = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
        }
        back++;
        __syncthreads(); // keep shared values until others have seen them
        if (data_col) { // read at the front end of the wave
            int16_t cnt = 0;
            int16_t old = shared_val[(front % window) * blockDim.x + threadIdx.x];
            if (old < params.min_viable_number) {
                old = 0; // no effect value
                cnt = 1; // bring count to normal
            }
            int16_t val = in[front * params.cols + col];
            shared_val[(front % window) * blockDim.x + threadIdx.x] = val;
            if (val < params.min_viable_number) {
                val = 0; // no effect value
                cnt -= 1; // count diff from normal
            }
            shared_sum[threadIdx.x] += val - old;
            shared_sum2[threadIdx.x] += val * val - old * old;
            shared_count[threadIdx.x] += cnt;
        }
        front++;
    } while (front < rmax);
    // wave back up to rmax
    do {
        __syncthreads(); // make others see the shared values
        if (result_col) { // write at the back end of the wave
            total_sum = total_sum2 = total_count = 0;
            for (auto j = left; j < right; j++) {
                total_sum += shared_sum[j];
                total_sum2 += shared_sum2[j];
                total_count += shared_count[j];
            }
            out[back * params.cols + col] = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
        }
        back++;
        __syncthreads(); // keep shared values until others have seen them
        if (data_col) { // read at the front end of the wave if possible
            int16_t cnt = -1; // normal count diff
            int16_t old = shared_val[(front % window) * blockDim.x + threadIdx.x];
            if (old < params.min_viable_number) {
                old = 0; // no effect value
                cnt += 1; // bring count to normal
            }
            int16_t val = 0;
            if (front < params.lines) {
                val = in[front * params.cols + col];
                if (val < params.min_viable_number)
                    val = 0; // no effect value
                else
                    cnt += 1; // count diff from normal
            }
            shared_sum[threadIdx.x] += val - old;
            shared_sum2[threadIdx.x] += val * val - old * old;
            shared_count[threadIdx.x] += cnt;
        }
        front++;
    } while (back < rmax);
}

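// Radial integration: accumulate corrected pixel values into azimuthal bins.
// image: input pixel values
// rad_integration_mapping: per-pixel bin index
// corr: per-pixel correction factor (set via LoadRadialIntegrationCorr, 1.0 by default)
// bin_sum / bin_count: per-bin accumulators in global memory (must be zeroed beforehand)
// npixel: number of pixels, nbins: number of bins
// Pixels with values close to INT16_MIN or INT16_MAX (e.g. masked pixels) are skipped.
// Each block first accumulates into shared-memory bins, then merges them atomically into global memory.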
__global__ void gpu_radial_integration(const int16_t *image, const uint16_t *rad_integration_mapping,
                                       float *corr, float *bin_sum, float *bin_count, uint32_t npixel,
                                       uint16_t nbins) {

    extern __shared__ float shared_mem_fp[];
    float* shared_sum = shared_mem_fp; // shared buffer [nbins]
    float* shared_count = &shared_sum[nbins]; // shared buffer [nbins]

    uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;

    for (uint32_t i = threadIdx.x; i < 2 * nbins; i += blockDim.x)
        shared_mem_fp[i] = 0;
    __syncthreads();

    for (uint32_t i = idx; i < npixel; i += blockDim.x * gridDim.x) {
        uint16_t bin = rad_integration_mapping[i];
        float value = static_cast<float>(image[i]) * corr[i];
        if ((image[i] > INT16_MIN + 4) && (image[i] < INT16_MAX - 4) && (bin < nbins)) {
            atomicAdd(&shared_sum[bin], value);
            atomicAdd(&shared_count[bin], 1.0f);
        }
    }
    __syncthreads();

    for (uint32_t i = threadIdx.x; i < nbins; i += blockDim.x) {
        atomicAdd(&bin_sum[i], shared_sum[i]);
        atomicAdd(&bin_count[i], shared_count[i]);
    }
}

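// Compact the per-pixel candidate map produced by analyze_pixel into a list of pixel indices.
// output: indices of flagged pixels (at most output_size entries are stored)
// counter: global count of flagged pixels (must be zeroed beforehand); entries beyond output_size are dropped
// input: per-pixel result, non-zero = spot candidate
// npixel: number of pixels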
__global__ void spot_finder_reduce(uint32_t *output, uint32_t* counter, const uint8_t* input,
                                   uint32_t npixel, uint32_t output_size) {
    uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;
    if (idx < npixel) {
        if (input[idx]) {
            auto old_counter = atomicAdd(counter, 1);
            if (old_counter < output_size)
                output[old_counter] = idx;
        }
    }
}

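// Set masked pixels (mask == 0) to INT16_MIN, so downstream kernels treat them as invalid.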
__global__ void apply_pixel_mask(int16_t *image, const uint8_t *mask, uint32_t npixel) {
    uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;
    if (idx < npixel) {
        if (mask[idx] == 0)
            image[idx] = INT16_MIN;
    }
}

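// Allocate device and pinned host buffers for an xpixels x ypixels detector,
// create the CUDA stream and upload the pixel mask (non-zero = valid pixel).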
GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const std::vector<uint8_t> &mask) :
        xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr), rad_integration_nbins(0), numberOfSMs(1) {

    if (get_gpu_count() == 0)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "No CUDA devices found");

    int deviceId;
    cuda_err(cudaGetDevice(&deviceId));
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    {
        int warp_size;
        cuda_err(cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, deviceId));
    }

    cudastream = new(CudaStreamWrapper);

    cuda_err(cudaStreamCreate(&cudastream->v));
    cuda_err(cudaMalloc(&gpu_mask, xpixels * ypixels * sizeof(int8_t)));
    cuda_err(cudaMalloc(&gpu_in, xpixels * ypixels * sizeof(int16_t)));
    cuda_err(cudaMalloc(&gpu_out, xpixels * ypixels * sizeof(int8_t)));
    cuda_err(cudaMalloc(&gpu_out_reduced, maxStrongPixel * sizeof(uint32_t)));
    cuda_err(cudaMalloc(&gpu_out_reduced_counter, sizeof(uint32_t)));
    cuda_err(cudaHostAlloc(&host_out_reduced, maxStrongPixel * sizeof(uint32_t), cudaHostAllocPortable));
    cuda_err(cudaHostAlloc(&host_out_reduced_counter, sizeof(uint32_t), cudaHostAllocPortable));

    cuda_err(cudaMemsetAsync(gpu_mask, 1, xpixels*ypixels, cudastream->v));

    if (mask.size() != xpixels * ypixels)
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Mismatch in mask size");
    cudaMemcpy(gpu_mask, mask.data(), xpixels*ypixels, cudaMemcpyHostToDevice);
}

GPUImageAnalysis::GPUImageAnalysis(int32_t xpixels, int32_t ypixels, const std::vector<uint8_t> &mask,
                                   const std::vector<uint16_t> &rad_int_mapping, uint16_t rad_int_nbins)
        : GPUImageAnalysis(xpixels, ypixels, mask) {
    rad_integration_nbins = rad_int_nbins;

    if (rad_int_mapping.size() != xpixels * ypixels)
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Mismatch in rad. int. mapping size");

    if (rad_integration_nbins > 0) {
        cuda_err(cudaMalloc(&gpu_rad_integration_bin_map, xpixels * ypixels * sizeof(int16_t)));
        cuda_err(cudaMalloc(&gpu_rad_integration_corr, xpixels * ypixels * sizeof(float)));

        cuda_err(cudaMalloc(&gpu_rad_integration_count, rad_integration_nbins * sizeof(float)));
        cuda_err(cudaMalloc(&gpu_rad_integration_sum, rad_integration_nbins * sizeof(float)));
        cuda_err(cudaHostAlloc(&host_rad_integration_count, rad_integration_nbins * sizeof(float), cudaHostAllocPortable));
        cuda_err(cudaHostAlloc(&host_rad_integration_sum, rad_integration_nbins * sizeof(float), cudaHostAllocPortable));

        cudaMemcpy(gpu_rad_integration_bin_map, rad_int_mapping.data(),
                   xpixels*ypixels * sizeof(uint16_t), cudaMemcpyHostToDevice);

        std::vector<float> corr(xpixels * ypixels, 1.0f);
        cudaMemcpy(gpu_rad_integration_corr, corr.data(),
                   xpixels * ypixels * sizeof(float), cudaMemcpyHostToDevice);
    }
}

GPUImageAnalysis::GPUImageAnalysis(int32_t xpixels, int32_t ypixels, const std::vector<uint8_t> &mask,
                                   const RadialIntegrationMapping& mapping)
        : GPUImageAnalysis(xpixels, ypixels, mask, mapping.GetPixelToBinMapping(), mapping.GetBinNumber()) {}

GPUImageAnalysis::~GPUImageAnalysis() {
    cudaStreamDestroy(cudastream->v);
    delete(cudastream);

    if (rad_integration_nbins > 0) {
        cudaFree(gpu_rad_integration_bin_map);
        cudaFree(gpu_rad_integration_count);
        cudaFree(gpu_rad_integration_sum);
        cudaFree(gpu_rad_integration_corr);
        cudaFreeHost(host_rad_integration_count);
        cudaFreeHost(host_rad_integration_sum);
    }

    cudaFreeHost(host_out_reduced);
    cudaFreeHost(host_out_reduced_counter);
    cudaFree(gpu_mask);
    cudaFree(gpu_in);
    cudaFree(gpu_out);
    cudaFree(gpu_out_reduced);
    cudaFree(gpu_out_reduced_counter);
}

void GPUImageAnalysis::SetInputBuffer(void *ptr) {
    host_in = (int16_t *) ptr;
}

bool GPUImageAnalysis::GPUPresent() {
    int device_count;
    cuda_err(cudaGetDeviceCount(&device_count));

    return (device_count > 0);
}

void GPUImageAnalysis::RunSpotFinder(const JFJochProtoBuf::DataProcessingSettings &settings) {
    // data_in is CUDA registered memory

    // Run COLSPOT (GPU version)
    spot_parameters spot_params;
    spot_params.strong_pixel_threshold2 = settings.signal_to_noise_threshold() * settings.signal_to_noise_threshold();
    spot_params.nbx = settings.local_bkg_size();
    spot_params.nby = settings.local_bkg_size();
    spot_params.lines = ypixels;
    spot_params.cols = xpixels;
    spot_params.count_threshold = settings.photon_count_threshold();
    spot_params.min_viable_number = INT16_MIN + 5; // pixels below this value (e.g. masked pixels set to INT16_MIN) are ignored

    if (2 * spot_params.nbx + 1 > windowSizeLimit)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nbx exceeds window size limit");
    if (2 * spot_params.nby + 1 > windowSizeLimit)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nby exceeds window size limit");
    if (windowSizeLimit > numberOfCudaThreads)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of cuda threads");
    if (windowSizeLimit > spot_params.cols)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of columns");
    if (windowSizeLimit > spot_params.lines)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of lines");

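    // Kernel launch geometry: each block runs numberOfCudaThreads threads along x;
    // the 2*nbx outermost threads only read neighbouring columns, so nWriters threads
    // per block produce output columns (nbx == nby here). blocks.x tiles the columns,
    // blocks.y (numberOfWaves) splits the rows into independent waves.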
    { // call cuda kernel
        const auto nWriters = numberOfCudaThreads - 2 * spot_params.nby;
        const auto nBlocks = (spot_params.cols + nWriters - 1) / nWriters;
        const auto window = 2 * spot_params.nby + 1;
        const auto sharedSize = (2 * sizeof(int32_t) +           // sum, sum2
                                 (1 + window) * sizeof(int16_t)  // count, val
                                ) * numberOfCudaThreads;
        const dim3 blocks(nBlocks, numberOfWaves);

        cuda_err(cudaMemsetAsync(gpu_out, 0, xpixels * ypixels * sizeof(uint8_t), cudastream->v));
        analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, cudastream->v>>>
                (gpu_in, gpu_out, spot_params);
    }

    {
        cuda_err(cudaMemsetAsync(gpu_out_reduced_counter, 0, sizeof(uint32_t), cudastream->v));
        const auto nblocks =
                xpixels * ypixels / numberOfCudaThreads + ((xpixels * ypixels % numberOfCudaThreads == 0) ? 0 : 1);

        spot_finder_reduce<<<nblocks, numberOfCudaThreads, 0, cudastream->v>>>(gpu_out_reduced, gpu_out_reduced_counter,
                                                                               gpu_out, xpixels*ypixels, maxStrongPixel);
    }
    cuda_err(cudaMemcpyAsync(host_out_reduced, gpu_out_reduced, maxStrongPixel * sizeof(uint32_t),
                             cudaMemcpyDeviceToHost, cudastream->v));

    cuda_err(cudaMemcpyAsync(host_out_reduced_counter, gpu_out_reduced_counter, sizeof(uint32_t),
                             cudaMemcpyDeviceToHost, cudastream->v));
}

void GPUImageAnalysis::GetSpotFinderResults(StrongPixelSet &pixel_set) {
    if (host_in == nullptr)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Host/GPU buffer not defined");

    cuda_err(cudaStreamSynchronize(cudastream->v));

    for (int i = 0; i < std::min<uint32_t>(maxStrongPixel, *host_out_reduced_counter); i++) {
        size_t npixel = host_out_reduced[i];
        size_t line = npixel / xpixels;
        size_t col = npixel % xpixels;
        pixel_set.AddStrongPixel(col, line, host_in[npixel]);
    }
}

void GPUImageAnalysis::GetSpotFinderResults(const DiffractionExperiment &experiment,
                                            const JFJochProtoBuf::DataProcessingSettings &settings,
                                            std::vector<DiffractionSpot> &vec) {
    StrongPixelSet pixel_set(experiment);
    GetSpotFinderResults(pixel_set);
    pixel_set.FindSpots(experiment, settings, vec);
}

void GPUImageAnalysis::RegisterBuffer() {
    cudaHostRegister(host_in, xpixels * ypixels * sizeof(uint16_t), cudaHostRegisterDefault);
}

void GPUImageAnalysis::UnregisterBuffer() {
    cudaHostUnregister(host_in);
}

void GPUImageAnalysis::LoadDataToGPU(bool apply_pixel_mask_on_gpu) {
    if (host_in == nullptr)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Host/GPU buffer not defined");

    cuda_err(cudaMemcpy(gpu_in, host_in, xpixels * ypixels * sizeof(int16_t), cudaMemcpyHostToDevice));

    if (apply_pixel_mask_on_gpu) {
        const auto nblocks =
                xpixels * ypixels / numberOfCudaThreads + ((xpixels * ypixels % numberOfCudaThreads == 0) ? 0 : 1);
        apply_pixel_mask<<<nblocks, numberOfCudaThreads, 0, cudastream->v>>>(gpu_in, gpu_mask, xpixels * ypixels);
    }
}

void GPUImageAnalysis::LoadRadialIntegrationCorr(const std::vector<float>& v) {
    if (rad_integration_nbins == 0)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Radial integration not initialized");
    if (v.size() != xpixels * ypixels)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Mismatch in correction input size");

    cudaMemcpy(gpu_rad_integration_corr, v.data(), xpixels * ypixels * sizeof(float), cudaMemcpyHostToDevice);
}

void GPUImageAnalysis::RunRadialIntegration() {
    if (rad_integration_nbins == 0)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Radial integration not initialized");

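    // bin_sum / bin_count are float arrays; zeroing them with cudaMemsetAsync is valid
    // because the all-zero bit pattern is 0.0f (and sizeof(int32_t) == sizeof(float)).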
    cuda_err(cudaMemsetAsync(gpu_rad_integration_sum, 0, rad_integration_nbins * sizeof(int32_t), cudastream->v));
    cuda_err(cudaMemsetAsync(gpu_rad_integration_count, 0, rad_integration_nbins * sizeof(int32_t), cudastream->v));

    gpu_radial_integration<<<numberOfSMs, numberOfCudaThreads, rad_integration_nbins * sizeof(uint32_t) * 2, cudastream->v>>>(
            gpu_in, gpu_rad_integration_bin_map, gpu_rad_integration_corr,
            gpu_rad_integration_sum, gpu_rad_integration_count, xpixels*ypixels,
            rad_integration_nbins);

    cuda_err(cudaMemcpyAsync(host_rad_integration_count, gpu_rad_integration_count,
                             rad_integration_nbins * sizeof(int32_t),
                             cudaMemcpyDeviceToHost, cudastream->v));
    cuda_err(cudaMemcpyAsync(host_rad_integration_sum, gpu_rad_integration_sum,
                             rad_integration_nbins * sizeof(int32_t),
                             cudaMemcpyDeviceToHost, cudastream->v));
}

void GPUImageAnalysis::GetRadialIntegrationProfile(std::vector<float> &result) {
    if (rad_integration_nbins == 0)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Radial integration not initialized");

    cuda_err(cudaStreamSynchronize(cudastream->v));

    result.resize(rad_integration_nbins);

    for (int i = 0; i < rad_integration_nbins; i++) {
        if (host_rad_integration_count[i] > 0)
            result[i] = static_cast<float>(host_rad_integration_sum[i])
                        / static_cast<float>(host_rad_integration_count[i]);
        else
            result[i] = 0;
    }
}

std::vector<float> GPUImageAnalysis::GetRadialIntegrationSum() const {
    if (rad_integration_nbins == 0)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Radial integration not initialized");

    cuda_err(cudaStreamSynchronize(cudastream->v));

    std::vector<float> out(rad_integration_nbins);
    memcpy(out.data(), host_rad_integration_sum, rad_integration_nbins * sizeof(int32_t));
    return out;
}

std::vector<float> GPUImageAnalysis::GetRadialIntegrationCount() const {
    if (rad_integration_nbins == 0)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Radial integration not initialized");

    cuda_err(cudaStreamSynchronize(cudastream->v));

    std::vector<float> out(rad_integration_nbins);
    memcpy(out.data(), host_rad_integration_count, rad_integration_nbins * sizeof(int32_t));
    return out;
}

float GPUImageAnalysis::GetRadialIntegrationRangeValue(uint16_t min_bin, uint16_t max_bin) {
    if (rad_integration_nbins == 0)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Radial integration not initialized");

    cuda_err(cudaStreamSynchronize(cudastream->v));

    float ret_sum = 0;
    float ret_count = 0;

    for (int i = std::min(rad_integration_nbins, min_bin);
         i <= std::min((uint16_t)(rad_integration_nbins-1), max_bin);
         i++) {
        ret_sum += host_rad_integration_sum[i];
        ret_count += host_rad_integration_count[i];
    }

    if (ret_count == 0)
        return 0;
    else
        return ret_sum / ret_count;
}