diff --git a/src/Algorithms/CollimatorPhysics.h b/src/Algorithms/CollimatorPhysics.h index a2f19ae..a3bdbbf 100644 --- a/src/Algorithms/CollimatorPhysics.h +++ b/src/Algorithms/CollimatorPhysics.h @@ -33,6 +33,10 @@ public: virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, double dt, double c, bool usedt = false, int streamId = -1) = 0; + virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, + void *bf_ptr, void *dt_ptr, double charge, + double mass, int npart, double c, int streamId = -1) = 0; + virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr, double dt, double c, bool usedt = false, diff --git a/src/CUDA/CudaCollimatorPhysics.cu b/src/CUDA/CudaCollimatorPhysics.cu index 495ad59..1b52fcc 100644 --- a/src/CUDA/CudaCollimatorPhysics.cu +++ b/src/CUDA/CudaCollimatorPhysics.cu @@ -33,6 +33,14 @@ __device__ inline double dot(double3 &d1, double3 &d2) { } +__device__ inline double3 cross(double3 &lhs, double3 &rhs) { + double3 tmp; + tmp.x = lhs.y * rhs.z - lhs.z * rhs.y; + tmp.y = lhs.z * rhs.x - lhs.x * rhs.z; + tmp.z = lhs.x * rhs.y - lhs.y * rhs.x; + return tmp; +} + __device__ inline bool checkHit(double &z, double *par) { /* check if particle is in the degrader material */ @@ -423,7 +431,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { } -__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) { +__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) { //get global id and thread id volatile int tid = threadIdx.x; @@ -449,7 +457,51 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, dou } } -//TODO: kernel for push with switch off unitless positions with dt[i]*c +__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, + double3 *gBf, double *gdt, double charge, + double mass, int npart, double c) +{ + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + if (idx < npart) { + double3 R = gR[idx]; + double3 P = gP[idx]; + double3 Ef = gEf[idx]; + double3 Bf = gBf[idx]; + double dt = gdt[idx]; + + P.x += 0.5 * dt * charge * c / mass * Ef.x; + P.y += 0.5 * dt * charge * c / mass * Ef.y; + P.z += 0.5 * dt * charge * c / mass * Ef.z; + + double gamma = sqrt(1.0 + dot(P, P)); + double3 t, w, s; + t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x; + t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y; + t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z; + + double3 crossPt = cross(P, t); + w.x = P.x + crossPt.x; + w.y = P.y + crossPt.y; + w.z = P.z + crossPt.z; + + s.x = 2.0 / (1.0 + dot(t, t)) * t.x; + s.y = 2.0 / (1.0 + dot(t, t)) * t.y; + s.z = 2.0 / (1.0 + dot(t, t)) * t.z; + + double3 crossws = cross(w, s); + P.x += crossws.x; + P.y += crossws.y; + P.z += crossws.z; + + P.x += 0.5 * dt * charge * c / mass * Ef.x; + P.y += 0.5 * dt * charge * c / mass * Ef.y; + P.z += 0.5 * dt * charge * c / mass * Ef.z; + + gP[idx] = P; + } +} __device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) { @@ -671,12 +723,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np } } else { if (streamId == -1) { - kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, - (double*)dt_ptr, c); + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, + (double*)dt_ptr, npart, c); } else { cudaStream_t cs = m_base->cuda_getStream(streamId); - kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, - (double*)dt_ptr, c); + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, + (double*)dt_ptr, npart, c); } } @@ -684,6 +736,29 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np return DKS_SUCCESS; } +int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, + void *bf_ptr, void *dt_ptr, double charge, + double mass, int npart, + double c, int streamId) +{ + + int threads = BLOCK_SIZE; + int blocks = npart / threads + 1; + + //call kernel + if (streamId == -1) { + kernelKick<<>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr, + (double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelKick<<>>((double3*)r_ptr, (double3*)p_ptr, + (double3*)ef_ptr, (double3*)bf_ptr, + (double*)dt_ptr, charge, mass, npart, c); + } + + return DKS_SUCCESS; +} + int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, void *orient_ptr, int npart, int nsec, diff --git a/src/CUDA/CudaCollimatorPhysics.cuh b/src/CUDA/CudaCollimatorPhysics.cuh index 9808f33..01d4663 100644 --- a/src/CUDA/CudaCollimatorPhysics.cuh +++ b/src/CUDA/CudaCollimatorPhysics.cuh @@ -141,6 +141,10 @@ public: int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, double dt, double c, bool usedt = false, int streamId = -1); + int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, + void *bf_ptr, void *dt_ptr, double charge, double mass, + int npart, double c, int streamId = -1); + /** BorisPusher push function with transformto function form OPAL * ParallelTTracker integration from OPAL implemented in cuda. * For more details see ParallelTTracler docomentation in opal diff --git a/src/DKSOPAL.cpp b/src/DKSOPAL.cpp index 111c3ba..789a4e7 100644 --- a/src/DKSOPAL.cpp +++ b/src/DKSOPAL.cpp @@ -6,6 +6,11 @@ DKSOPAL::DKSOPAL() { dksgreens = nullptr; } +DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) { + setAPI(api_name, strlen(api_name)); + setDevice(device_name, strlen(device_name)); +} + DKSOPAL::~DKSOPAL() { delete dksfft; delete dkscol; @@ -14,7 +19,6 @@ DKSOPAL::~DKSOPAL() { int DKSOPAL::setupOPAL() { int ierr = DKS_ERROR; - if (apiOpenCL()) { ierr = OPENCL_SAFECALL( DKS_SUCCESS ); //TODO: only enable if AMD libraries are available @@ -40,9 +44,10 @@ int DKSOPAL::setupOPAL() { int DKSOPAL::initDevice() { int ierr = setupDevice(); - if (ierr == DKS_ERROR) + if (ierr == DKS_SUCCESS) ierr = setupOPAL(); return ierr; + } /* setup fft plans to reuse if multiple ffts of same size are needed */ @@ -237,7 +242,6 @@ int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) { - return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback); } @@ -265,6 +269,23 @@ int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, } +int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, + int npart, double c, int streamId) { + + return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId); + +} + +int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, + void *bf_ptr, void *dt_ptr, double charge, double mass, + int npart, double c, int streamId) +{ + + return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr, + charge, mass, npart, c, streamId); + +} + int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr, double dt, diff --git a/src/DKSOPAL.h b/src/DKSOPAL.h index f182c1d..e717505 100644 --- a/src/DKSOPAL.h +++ b/src/DKSOPAL.h @@ -46,6 +46,8 @@ public: DKSOPAL(); + DKSOPAL(const char* api_name, const char* device_name); + ~DKSOPAL(); int initDevice(); @@ -210,6 +212,20 @@ public: int npart, int nsec, void *dt_ptr, double dt, double c, bool usedt = false, int streamId = -1); + /** + * Integration code from ParallelTTracker from OPAL. + * For specifics check OPAL docs and CudaCollimatorPhysics class docs + */ + int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, + int npart, double c, int streamId = -1); + + /** + * Integration code from ParallelTTracker from OPAL. + * For specifics check OPAL docs and CudaCollimatorPhysics class docs + */ + int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, + void *bf_ptr, void *dt_ptr, double charge, + double mass, int npart, double c, int streamId = -1); };