// DKS/src/CUDA/CudaCollimatorPhysics.cu
#include "CudaCollimatorPhysics.cuh"
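// Physical constants: proton and electron rest mass [GeV], speed of light [m/s],
// Avogadro's number, classical electron radius [m], proton charge number.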
//#define M_P 0.93827231e+00
#define M_P 0.93827204e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
//#define eM_E 0.51099906e-03
#define eM_E 0.51099892e-03
#define Z_P 1
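// Bethe-Bloch prefactor 4*pi*N_A*r_e^2*m_e*c^2; the 1e7 factor converts m^2 -> cm^2
// and GeV -> MeV, giving K of about 0.307 MeV cm^2/mol.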
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)
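// Indices into the parameter array passed from the host: degrader position and length along z,
// material properties (density, Z, A, low-energy stopping coefficients, radiation length,
// mean excitation energy), time step and low-energy threshold.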
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11
#define LOWENERGY_THR 12
#define BLOCK_SIZE 128
#define NUMPAR 13
__device__ inline double dot(double3 &d1, double3 &d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
}
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
return tmp;
}
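// Rotate vector Rorg by angle Theta around axis W (Rodrigues' rotation formula).
// Note: W is normalized in place as a side effect.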
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
double c=cos(Theta);
double s=sin(Theta);
double dotW = sqrt(dot(W,W));
W.x = W.x / dotW;
W.y = W.y / dotW;
W.z = W.z / dotW;
double dotWR = dot(W, Rorg) * (1.0 - c);
double3 crossW = cross(W, Rorg);
double3 tmp;
tmp.x = Rorg.x * c + crossW.x * s + W.x * dotWR;
tmp.y = Rorg.y * c + crossW.y * s + W.y * dotWR;
tmp.z = Rorg.z * c + crossW.z * s + W.z * dotWR;
return tmp;
}
__device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
}
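// Apply energy loss to a proton of kinetic energy Eng [GeV] over one time step.
// Between 10 keV and 0.6 MeV an empirical low-energy stopping-power fit (A2..A5 coefficients)
// is used, above 0.6 MeV the Bethe-Bloch formula; Gaussian energy straggling sigma_E is added
// in both branches. The particle is flagged dead if it falls below the low-energy threshold or
// if the computed dEdx becomes positive (outside the validity of the stopping-power formula).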
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{
volatile double dEdx = 0.0;
volatile double gamma = (Eng + M_P) / M_P;
volatile double gamma2 = gamma * gamma;
double beta = sqrt(1.0 - 1.0 / gamma2);
volatile double beta2 = beta * beta;
double deltas = par[DT_M] * beta * C;
volatile double deltasrho = deltas * 100 * par[RHO_M];
volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
double Ts = (Eng * 1E6) / 1.0073;
double epsilon_low = par[A2_C] * pow(Ts, 0.45);
double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
Eng = Eng + delta_E / 1E3;
}
if (Eng >= 0.0006) {
double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
(1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
(1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
Tmax / par[I_M] / par[I_M]) - beta2);
double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
Eng = Eng + delta_E / 1E3;
}
pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) );
}
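// Rotate the (px, pz) momentum pair by the scattering angle thetacou and apply the transverse
// displacement xplane to the (x, z) position pair; coord==2 additionally advances z by the
// longitudinal drift deltas * pz / normP.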
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
double &normP, double &thetacou, double &deltas, int coord,
double *par)
{
double Psixz;
double pxz;
// atan2 resolves the quadrant of (px, pz) directly, replacing explicit sign branching
Psixz = atan2(px, pz);
pxz = sqrt(px*px + pz*pz);
if(coord==1) {
x = x + deltas * px/normP + xplane*cos(Psixz);
z = z - xplane * sin(Psixz);
}
if(coord==2) {
x = x + deltas * px/normP + xplane*cos(Psixz);
z = z - xplane * sin(Psixz) + deltas * pz / normP;
}
px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
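// Multiple Coulomb scattering over one time step. The characteristic angle theta0 follows the
// Highland/PDG formula; Gaussian-distributed kicks (rejected beyond 3.5*theta0) are applied
// independently in the x-z and y-z planes. With probability 0.0047 an additional large-angle
// Rutherford-type scatter is applied when enableRutherfordScattering is set.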
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
bool enableRutherfordScattering)
{
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P;
double normP = sqrt(dot(P, P));
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
double deltas = par[DT_M] * beta * C;
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering"
double z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
//__syncthreads();
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
// y-direction: See Physical Review, "Multiple Scattering"
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
//__syncthreads();
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if( (P2 < 0.0047) && enableRutherfordScattering) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
//double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double thetaru = 2.5 * sqrt(1 / P3) * 2.0 * theta0;
double phiru = 2.0 * M_PI * curand_uniform_double(&state);
double th0=atan2(sqrt(P.x*P.x+P.y*P.y),fabs(P.z));
double3 W,X;
double dotP = sqrt(dot(P,P));
X.x = cos(phiru)*sin(thetaru) * dotP;
X.y = sin(phiru)*sin(thetaru) * dotP;
X.z = cos(thetaru) * dotP;
W.x = -P.y;
W.y = P.x;
W.z = 0.0;
P = ArbitraryRotation(W, X, th0);
}
}
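// Main degrader kernel (array-of-structures layout): one thread per particle. Particles inside
// the material undergo energy loss and Coulomb scattering; particles that die are labelled -1,
// particles outside the material drift ballistically for one time step and are labelled -2.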
template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles, bool enableRutherfordScattering)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory
extern __shared__ double smem[];
double *p = (double*)smem;
double3 *R = (double3*)&smem[NUMPAR];
curandState s;
double3 P;
for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
p[tt] = par[tt];
__syncthreads();
if (idx < numparticles) {
s = state[idx];
R[tid] = data[idx].Rincol;
P = data[idx].Pincol;
bool pdead = false;
volatile double sq = sqrt(1.0 + dot(P, P));
double Eng;
if (checkHit(R[tid].z, p)) {
Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p);
if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P));
P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
data[idx].Pincol = P;
} else {
data[idx].label = -1;
}
state[idx] = s;
} else {
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
data[idx].label = -2;
}
data[idx].Rincol = R[tid];
}
}
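// Same physics as kernelCollimatorPhysics, but operating on the structure-of-arrays
// CUDA_PART2_SMALL layout with statically sized shared-memory buffers.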
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles,
bool enableRutherfordScattering)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory
__shared__ double p[NUMPAR];
__shared__ double3 R[BLOCK_SIZE];
if (tid < NUMPAR)
p[tid] = par[tid];
__syncthreads();
curandState s;
double3 P;
if (idx < numparticles) {
R[tid] = data.Rincol[idx];
P = data.Pincol[idx];
s = state[idx];
double sq = sqrt(1.0 + dot(P, P));
bool pdead = false;
if (checkHit(R[tid].z, p)) {
double Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p);
if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P));
P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
data.Pincol[idx] = P;
} else {
data.label[idx] = -1;
}
} else {
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
data.label[idx] = -2;
}
data.Rincol[idx] = R[tid];
state[idx] = s;
}
}
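// unitlessOn divides a position by dt*c (switch to unitless coordinates),
// unitlessOff multiplies it back (switch to physical coordinates).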
inline __device__ void unitlessOff(double3 &a, const double &c) {
a.x *= c;
a.y *= c;
a.z *= c;
}
inline __device__ void unitlessOn(double3 &a, const double &c) {
a.x /= c;
a.y /= c;
a.z /= c;
}
//switch to unitless positions with dtc
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOn(R, dtc);
unitlessOn(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch to unitless positions with dt*c
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOn(R, dt*c);
unitlessOn(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitless positions with dtc
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOff(R, dtc);
unitlessOff(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitless positions with dt*c
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
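// Drift kernel: advance positions by half a step, R += 0.5 * P / gamma, in unitless
// coordinates (common scalar time step dtc = dt*c for all particles).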
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
//switch to unitless positions
unitlessOn(R, dtc);
//push
double tmp = sqrt(1.0 + dot(P, P));
R.x += 0.5 * P.x / tmp;
R.y += 0.5 * P.y / tmp;
R.z += 0.5 * P.z / tmp;
//switch off unitless positions with dt*c
unitlessOff(R, dtc);
gR[idx] = R;
}
}
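// Same half-step drift, but with an individual time step per particle (gdt array).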
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double dt = gdt[idx];
//switch to unitless positions with dt*c
unitlessOn(R, dt*c);
double tmp = sqrt(1.0 + dot(P, P));
R.x += 0.5 * P.x / tmp;
R.y += 0.5 * P.y / tmp;
R.z += 0.5 * P.z / tmp;
//switch off unitless positions with dt*c
unitlessOff(R, dt*c);
gR[idx] = R;
}
}
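// Kick kernel: Boris-type integrator step. Half an electric-field kick, rotation of the
// momentum around the magnetic field (auxiliary vectors t, w, s), then the second half
// of the electric-field kick.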
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge,
double mass, int npart, double c)
{
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double3 Ef = gEf[idx];
double3 Bf = gBf[idx];
double dt = gdt[idx];
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
double gamma = sqrt(1.0 + dot(P, P));
double3 t, w, s;
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
double3 crossPt = cross(P, t);
w.x = P.x + crossPt.x;
w.y = P.y + crossPt.y;
w.z = P.z + crossPt.z;
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
double3 crossws = cross(w, s);
P.x += crossws.x;
P.y += crossws.y;
P.z += crossws.z;
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
gP[idx] = P;
}
}
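// Rotate vec into the local frame defined by the rotation matrix built from the three
// orientation angles stored in ori.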
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
const double sina = sin(ori.x);
const double cosa = cos(ori.x);
const double sinb = sin(ori.y);
const double cosb = cos(ori.y);
const double sinc = sin(ori.z);
const double cosc = cos(ori.z);
double3 temp;
temp.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z;
temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x +
(cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z;
temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x +
(sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z;
return temp;
}
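// Push kernel for the parallel-T tracker: transform the momentum into the orientation of the
// particle's last section (gOrient indexed by gLastSection, identity if out of range), then
// apply the half-step drift in unitless coordinates. Common scalar time step dtc.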
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
int npart, int nsect, double dtc)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 X = gX[idx];
double3 P = gP[idx];
long lLastSection = gLastSection[idx];
double3 ori;
if (lLastSection > -1 && lLastSection < nsect) {
ori = gOrient[lLastSection];
} else {
ori.x = 0.0;
ori.y = 0.0;
ori.z = 0.0;
}
double3 tmp = deviceTransformTo(P, ori);
unitlessOn(X, dtc);
double gam = sqrt(1.0 + dot(tmp, tmp));
X.x += 0.5 * tmp.x / gam;
X.y += 0.5 * tmp.y / gam;
X.z += 0.5 * tmp.z / gam;
unitlessOff(X, dtc);
gX[idx] = X;
}
}
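// Same push with section transform, but with an individual time step per particle (gdt array).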
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
int npart, int nsect, double *gdt, double c)
{
//get global id and thread id
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 X = gX[idx];
double3 P = gP[idx];
long lLastSection = gLastSection[idx];
double dt = gdt[idx];
double3 ori;
if (lLastSection > -1 && lLastSection < nsect) {
ori = gOrient[lLastSection];
} else {
ori.x = 0.0;
ori.y = 0.0;
ori.z = 0.0;
}
double3 tmp = deviceTransformTo(P, ori);
unitlessOn(X, dt*c);
double gam = sqrt(1.0 + dot(tmp, tmp));
X.x += 0.5 * tmp.x / gam;
X.y += 0.5 * tmp.y / gam;
X.z += 0.5 * tmp.z / gam;
unitlessOff(X, dt*c);
gX[idx] = X;
}
}
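// Thrust functors: the binary operators sort particles by label in descending order, moving
// labelled particles (-1 dead, -2 outside the material) to the back of the array; the unary
// operators count particles whose label is below the threshold.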
struct compare_particle
{
int threshold;
compare_particle() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART p1, CUDA_PART p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART p1) {
return p1.label < threshold;
}
};
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
struct less_then
{
__host__ __device__
bool operator()(int x)
{
return x < 0;
}
};
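// Host wrapper: launch the degrader kernel with one thread per particle and enough dynamic
// shared memory for the parameter array plus one double3 buffer per thread.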
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering)
{
int threads = BLOCK_SIZE;
int blocks = numparticles / threads + 1;
//calc shared memory size
int smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE;
//call kernel
kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
(double*)par_ptr,
m_base->cuda_getCurandStates(),
numparticles,
enableRutherfordScattering);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
std::cout << "Err2: " << cudaGetErrorString(err) << std::endl;
return DKS_SUCCESS;
}
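// Count particles flagged for removal (label < 0) and, if any exist, sort the array so that
// they end up at the back.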
int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
int &numaddback)
{
//wrap mem_ptr with thrust device ptr
thrust::device_ptr<CUDA_PART_SMALL> dev_ptr( (CUDA_PART_SMALL*)mem_ptr);
//count -2 and -1 particles
compare_particle_small comp;
comp.set_threshold(0);
numaddback = thrust::count_if(dev_ptr, dev_ptr + numparticles, comp);
//sort particles
if (numaddback > 0)
thrust::sort(dev_ptr, dev_ptr + numparticles, comp);
return DKS_SUCCESS;
}
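// Host wrapper for the drift kernel; uses either a common time step dt or per-particle time
// steps from dt_ptr (usedt), optionally on a dedicated CUDA stream.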
int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c, bool usedt,
int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (!usedt) {
if (streamId == -1) {
kernelPush<<<blocks, threads >>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c);
}
} else {
if (streamId == -1) {
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, npart, c);
}
}
return DKS_SUCCESS;
}
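// Host wrapper for the Boris-type kick kernel, optionally on a dedicated CUDA stream.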
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart,
double c, int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (streamId == -1) {
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double3*)ef_ptr, (double3*)bf_ptr,
(double*)dt_ptr, charge, mass, npart, c);
}
return DKS_SUCCESS;
}
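// Host wrapper for the push-with-transform kernel. The dynamic shared-memory size requested
// here (sizeof(double3) * nsec) does not appear to be used inside kernelPushTransform.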
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec,
void *dt_ptr, double dt,
double c, bool usedt,
int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
int smem = sizeof(double3) * nsec;
//call kernel
if (!usedt) {
if (streamId == -1) {
kernelPushTransform<<<blocks, threads, smem>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, dt*c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPushTransform<<<blocks, threads, smem, cs>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, dt*c);
}
} else {
if (streamId == -1) {
kernelPushTransform<<<blocks, threads, smem>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, (double*)dt_ptr, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPushTransform<<<blocks, threads, smem, cs>>>((double3*)x_ptr, (double3*)p_ptr,
(long*)lastSec_ptr, (double3*)orient_ptr,
npart, nsec, (double*)dt_ptr, c);
}
}
return DKS_SUCCESS;
}