// DKS/src/CUDA/CudaCollimatorPhysics.cu
#include "CudaCollimatorPhysics.cuh"
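// Physical constants: proton and electron rest mass [GeV], speed of light [m/s],
// Avogadro's number, classical electron radius [m], proton charge number.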
//#define M_P 0.93827231e+00
#define M_P 0.93827204e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
//#define eM_E 0.51099906e-03
#define eM_E 0.51099892e-03
#define Z_P 1
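// Bethe-Bloch prefactor 4*pi*N_A*r_e^2*m_e*c^2; the 1e7 factor converts m^2 -> cm^2
// and GeV -> MeV, giving K of about 0.307 MeV cm^2/mol.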
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)
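// Indices into the parameter array passed from the host: degrader position and length along z,
// material properties (density, Z, A, low-energy stopping coefficients, radiation length,
// mean excitation energy), time step and low-energy threshold.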
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11
#define LOWENERGY_THR 12
#define BLOCK_SIZE 128
#define NUMPAR 13
__device__ inline double dot(double3 &d1, double3 &d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
}
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
return tmp;
}
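// Rotate vector Rorg by angle Theta around axis W (Rodrigues' rotation formula).
// Note: W is normalized in place as a side effect.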
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
double c=cos(Theta);
double s=sin(Theta);
double dotW = sqrt(dot(W,W));
W.x = W.x / dotW;
W.y = W.y / dotW;
W.z = W.z / dotW;
double dotWR = dot(W, Rorg) * (1.0 - c);
double3 crossW = cross(W, Rorg);
double3 tmp;
tmp.x = Rorg.x * c + crossW.x * s + W.x * dotWR;
tmp.y = Rorg.y * c + crossW.y * s + W.y * dotWR;
tmp.z = Rorg.z * c + crossW.z * s + W.z * dotWR;
return tmp;
}
__device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
}
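// Apply energy loss to a proton of kinetic energy Eng [GeV] over one time step.
// Between 10 keV and 0.6 MeV an empirical low-energy stopping-power fit (A2..A5 coefficients)
// is used, above 0.6 MeV the Bethe-Bloch formula; Gaussian energy straggling sigma_E is added
// in both branches. The particle is flagged dead if it falls below the low-energy threshold or
// if the computed dEdx becomes positive (outside the validity of the stopping-power formula).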
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{
volatile double dEdx = 0.0;
volatile double gamma = (Eng + M_P) / M_P;
volatile double gamma2 = gamma * gamma;
double beta = sqrt(1.0 - 1.0 / gamma2);
volatile double beta2 = beta * beta;
double deltas = par[DT_M] * beta * C;
volatile double deltasrho = deltas * 100 * par[RHO_M];
volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
double Ts = (Eng * 1E6) / 1.0073;
double epsilon_low = par[A2_C] * pow(Ts, 0.45);
double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
Eng = Eng + delta_E / 1E3;
}
if (Eng >= 0.0006) {
double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
(1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
(1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
Tmax / par[I_M] / par[I_M]) - beta2);
double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
Eng = Eng + delta_E / 1E3;
}
pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) );
}
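// Rotate the (px, pz) momentum pair by the scattering angle thetacou and apply the transverse
// displacement xplane to the (x, z) position pair; coord==2 additionally advances z by the
// longitudinal drift deltas * pz / normP.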
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
double &normP, double &thetacou, double &deltas, int coord,
double *par)
{
double Psixz;
double pxz;
// atan2 resolves the quadrant of (px, pz) directly, replacing explicit sign branching
Psixz = atan2(px, pz);
pxz = sqrt(px*px + pz*pz);
if(coord==1) {
x = x + deltas * px/normP + xplane*cos(Psixz);
z = z - xplane * sin(Psixz);
}
if(coord==2) {
x = x + deltas * px/normP + xplane*cos(Psixz);
z = z - xplane * sin(Psixz) + deltas * pz / normP;
}
px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
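// Multiple Coulomb scattering over one time step. The characteristic angle theta0 follows the
// Highland/PDG formula; Gaussian-distributed kicks (rejected beyond 3.5*theta0) are applied
// independently in the x-z and y-z planes. With probability 0.0047 an additional large-angle
// Rutherford-type scatter is applied when enableRutherfordScattering is set.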
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
bool enableRutherfordScattering)
{
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P;
double normP = sqrt(dot(P, P));
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
double deltas = par[DT_M] * beta * C;
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering"
double z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
//__syncthreads();
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
// y-direction: See Physical Review, "Multiple Scattering"
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
//__syncthreads();
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if( (P2 < 0.0047) && enableRutherfordScattering) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
//double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double thetaru = 2.5 * sqrt(1 / P3) * 2.0 * theta0;
double phiru = 2.0 * M_PI * curand_uniform_double(&state);
double th0=atan2(sqrt(P.x*P.x+P.y*P.y),fabs(P.z));
double3 W,X;
double dotP = sqrt(dot(P,P));
X.x = cos(phiru)*sin(thetaru) * dotP;
X.y = sin(phiru)*sin(thetaru) * dotP;
X.z = cos(thetaru) * dotP;
W.x = -P.y;
W.y = P.x;
W.z = 0.0;
P = ArbitraryRotation(W, X, th0);
}
}
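// Main degrader kernel (array-of-structures layout): one thread per particle. Particles inside
// the material undergo energy loss and Coulomb scattering; particles that die are labelled -1,
// particles outside the material drift ballistically for one time step and are labelled -2.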
template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles, bool enableRutherfordScattering)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory
extern __shared__ double smem[];
double *p = (double*)smem;
double3 *R = (double3*)&smem[NUMPAR];
curandState s;
double3 P;
for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
p[tt] = par[tt];
__syncthreads();
if (idx < numparticles) {
s = state[idx];
R[tid] = data[idx].Rincol;
P = data[idx].Pincol;
bool pdead = false;
volatile double sq = sqrt(1.0 + dot(P, P));
double Eng;
if (checkHit(R[tid].z, p)) {
Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p);
if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P));
P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
data[idx].Pincol = P;
} else {
data[idx].label = -1;
}
state[idx] = s;
} else {
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
data[idx].label = -2;
}
data[idx].Rincol = R[tid];
}
}
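// Same physics as kernelCollimatorPhysics, but operating on the structure-of-arrays
// CUDA_PART2_SMALL layout with statically sized shared-memory buffers.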
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles,
bool enableRutherfordScattering)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory
__shared__ double p[NUMPAR];
__shared__ double3 R[BLOCK_SIZE];
if (tid < NUMPAR)
p[tid] = par[tid];
__syncthreads();
curandState s;
double3 P;
if (idx < numparticles) {
R[tid] = data.Rincol[idx];
P = data.Pincol[idx];
s = state[idx];
double sq = sqrt(1.0 + dot(P, P));
bool pdead = false;
if (checkHit(R[tid].z, p)) {
double Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p);
if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P));
P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
data.Pincol[idx] = P;
} else {
data.label[idx] = -1;
}
} else {
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
data.label[idx] = -2;
}
data.Rincol[idx] = R[tid];
state[idx] = s;
}
}
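// unitlessOn divides a position by dt*c (switch to unitless coordinates),
// unitlessOff multiplies it back (switch to physical coordinates).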
inline __device__ void unitlessOff(double3 &a, const double &c) {
a.x *= c;
a.y *= c;
a.z *= c;
}
inline __device__ void unitlessOn(double3 &a, const double &c) {
a.x /= c;
a.y /= c;
a.z /= c;
}
//switch to unitless positions with dtc
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOn(R, dtc);
unitlessOn(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch to unitless positions with dt*c
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOn(R, dt*c);
unitlessOn(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitless positions with dtc
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOff(R, dtc);
unitlessOff(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitless positions with dt*c
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
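// Drift kernel: advance positions by half a step, R += 0.5 * P / gamma, in unitless
// coordinates (common scalar time step dtc = dt*c for all particles).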
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
//switch to unitless positions
unitlessOn(R, dtc);
//push
double tmp = sqrt(1.0 + dot(P, P));
R.x += 0.5 * P.x / tmp;
R.y += 0.5 * P.y / tmp;
R.z += 0.5 * P.z / tmp;
//switch off unitless positions with dt*c
unitlessOff(R, dtc);
gR[idx] = R;
}
}
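// Same half-step drift, but with an individual time step per particle (gdt array).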
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double dt = gdt[idx];
//switch to unitless positions with dt*c
unitlessOn(R, dt*c);
double tmp = sqrt(1.0 + dot(P, P));
R.x += 0.5 * P.x / tmp;
R.y += 0.5 * P.y / tmp;
R.z += 0.5 * P.z / tmp;
//switch off unitless positions with dt*c
unitlessOff(R, dt*c);
gR[idx] = R;
}
}
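// Kick kernel: Boris-type integrator step. Half an electric-field kick, rotation of the
// momentum around the magnetic field (auxiliary vectors t, w, s), then the second half
// of the electric-field kick.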
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge,
double mass, int npart, double c)
{
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double3 Ef = gEf[idx];
double3 Bf = gBf[idx];
double dt = gdt[idx];
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
double gamma = sqrt(1.0 + dot(P, P));
double3 t, w, s;
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
double3 crossPt = cross(P, t);
w.x = P.x + crossPt.x;
w.y = P.y + crossPt.y;
w.z = P.z + crossPt.z;
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
double3 crossws = cross(w, s);
P.x += crossws.x;
P.y += crossws.y;
P.z += crossws.z;
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
gP[idx] = P;
}
}
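// Rotate vec into the local frame defined by the rotation matrix built from the three
// orientation angles stored in ori.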
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
const double sina = sin(ori.x);
const double cosa = cos(ori.x);
const double sinb = sin(ori.y);
const double cosb = cos(ori.y);
const double sinc = sin(ori.z);
const double cosc = cos(ori.z);
double3 temp;
temp.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z;
temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x +
(cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z;
temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x +
(sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z;
return temp;
}
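// Push kernel for the parallel-T tracker: transform the momentum into the orientation of the
// particle's last section (gOrient indexed by gLastSection, identity if out of range), then
// apply the half-step drift in unitless coordinates. Common scalar time step dtc.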
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
int npart, int nsect, double dtc)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 X = gX[idx];
double3 P = gP[idx];
long lLastSection = gLastSection[idx];
double3 ori;
if (lLastSection > -1 && lLastSection < nsect) {
ori = gOrient[lLastSection];
} else {
ori.x = 0.0;
ori.y = 0.0;
ori.z = 0.0;
}
double3 tmp = deviceTransformTo(P, ori);
unitlessOn(X, dtc);
double gam = sqrt(1.0 + dot(tmp, tmp));
X.x += 0.5 * tmp.x / gam;
X.y += 0.5 * tmp.y / gam;
X.z += 0.5 * tmp.z / gam;
unitlessOff(X, dtc);
gX[idx] = X;
}
}
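// Same push with section transform, but with an individual time step per particle (gdt array).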
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
int npart, int nsect, double *gdt, double c)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 X = gX[idx];
double3 P = gP[idx];
long lLastSection = gLastSection[idx];
double dt = gdt[idx];
double3 ori;
if (lLastSection > -1 && lLastSection < nsect) {
ori = gOrient[lLastSection];
} else {
ori.x = 0.0;
ori.y = 0.0;
ori.z = 0.0;
}
double3 tmp = deviceTransformTo(P, ori);
unitlessOn(X, dt*c);
double gam = sqrt(1.0 + dot(tmp, tmp));
X.x += 0.5 * tmp.x / gam;
X.y += 0.5 * tmp.y / gam;
X.z += 0.5 * tmp.z / gam;
unitlessOff(X, dt*c);
gX[idx] = X;
}
}
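// Thrust functors: the binary operators sort particles by label in descending order, moving
// labelled particles (-1 dead, -2 outside the material) to the back of the array; the unary
// operators count particles whose label is below the threshold.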
struct compare_particle
{
int threshold;
compare_particle() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART p1, CUDA_PART p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART p1) {
return p1.label < threshold;
}
};
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
struct less_then
{
__host__ __device__
bool operator()(int x)
{
return x < 0;
}
};
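// Host wrapper: launch the degrader kernel with one thread per particle and enough dynamic
// shared memory for the parameter array plus one double3 buffer per thread.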
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering)
{
int threads = BLOCK_SIZE;
int blocks = numparticles / threads + 1;
//calc shared memory size
int smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE;
//call kernel
kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
(double*)par_ptr,
m_base->cuda_getCurandStates(),
numparticles,
enableRutherfordScattering);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
std::cout << "Err2: " << cudaGetErrorString(err) << std::endl;
return DKS_SUCCESS;
}
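// Count particles flagged for removal (label < 0) and, if any exist, sort the array so that
// they end up at the back.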
int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
int &numaddback)
{
//wrap mem_ptr with thrust device ptr
thrust::device_ptr<CUDA_PART_SMALL> dev_ptr( (CUDA_PART_SMALL*)mem_ptr);
//count -2 and -1 particles
compare_particle_small comp;
comp.set_threshold(0);
numaddback = thrust::count_if(dev_ptr, dev_ptr + numparticles, comp);
//sort particles
if (numaddback > 0)
thrust::sort(dev_ptr, dev_ptr + numparticles, comp);
return DKS_SUCCESS;
}
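// Host wrapper for the drift kernel; uses either a common time step dt or per-particle time
// steps from dt_ptr (usedt), optionally on a dedicated CUDA stream.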
int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c, bool usedt,
int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (!usedt) {
if (streamId == -1) {
kernelPush<<<blocks, threads >>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c);
}
} else {
if (streamId == -1) {
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, npart, c);
}
}
return DKS_SUCCESS;
}
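// Host wrapper for the Boris-type kick kernel, optionally on a dedicated CUDA stream.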
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart,
double c, int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (streamId == -1) {
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double3*)ef_ptr, (double3*)bf_ptr,
(double*)dt_ptr, charge, mass, npart, c);
}
return DKS_SUCCESS;
}
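// Host wrapper for the push-with-transform kernel. The dynamic shared-memory size requested
// here (sizeof(double3) * nsec) does not appear to be used inside kernelPushTransform.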
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec,
void *dt_ptr, double dt,
double c, bool usedt,
int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
int smem = sizeof(double3) * nsec;
//call kernel
if (!usedt) {
if (streamId == -1) {
kernelPushTransform<<<blocks, threads, smem>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, dt*c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPushTransform<<<blocks, threads, smem, cs>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, dt*c);
}
} else {
if (streamId == -1) {
kernelPushTransform<<<blocks, threads, smem>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, (double*)dt_ptr, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPushTransform<<<blocks, threads, smem, cs>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, (double*)dt_ptr, c);
}
}
return DKS_SUCCESS;
}