Files
DKS/src/MIC/MICCollimatorPhysics.cpp
2016-11-16 18:12:17 +01:00

886 lines
26 KiB
C++

#include "MICCollimatorPhysics.h"
#define M_P 0.93827231e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
#define eM_E 0.51099906e-03
#define Z_P 1
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11
__declspec(target(mic))
double dot(mic_double3 d1, mic_double3 d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
}
__declspec(target(mic))
double dot(double dx, double dy, double dz) {
return (dx * dx + dy * dy + dz * dz);
}
__declspec(target(mic))
bool checkHit(double &z, double *par) {
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
}
__declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane,
double normP, double thetacou, double deltas, int coord)
{
double Psixz = 1;
double pxz = 1;
if ( px >= 0 && pz >= 0 )
Psixz = atan(px/pz);
else if ( px > 0 && pz < 0 )
Psixz = atan(px/pz) + PI;
else if (px < 0 && pz > 0)
Psixz = atan(px/pz) + 2*PI;
else
Psixz = atan(px/pz) + PI;
pxz = sqrt(px*px + pz*pz);
if(coord == 1) {
x = x + deltas * px / normP + xplane*cos(Psixz);
z = z - xplane * sin(Psixz);
}
if(coord == 2) {
x = x + deltas * px / normP + xplane * cos(Psixz);
z = z - xplane * sin(Psixz) + deltas * pz / normP;
}
px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
__declspec(target(mic))
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P;
double normP = sqrt(dot(P, P));
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
double deltas = par[DT_M] * beta * C;
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering"
double z1, z2;
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
double thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
thetacou = z2 * theta0;
}
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1);
double P2;//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1);
if(P2 < 0.0047) {
double P3, P4;
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
if(P4 > 0.5)
thetaru = -thetaru;
Rot(P.x ,P.z, R.x, R.z, xplane, normP, thetaru, deltas, 0);
}
// y-direction: See Physical Review, "Multiple Scattering"
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
thetacou = z2 * theta0;
}
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.y, P.z, R.y, R.z, yplane, normP, thetacou, deltas, 2);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1);
if(P2 < 0.0047) {
double P3, P4;
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
if(P4 > 0.5)
thetaru = -thetaru;
Rot(P.y, P.z, R.y, R.z, yplane, normP, thetaru, deltas, 0);
}
}
__declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
double *par, VSLStreamStatePtr &stream, int ii, int size)
{
double normP[MIC_WIDTH] __attribute__((aligned(64)));
double deltas[MIC_WIDTH] __attribute__((aligned(64)));
double theta0[MIC_WIDTH] __attribute__((aligned(64)));
double P1[MIC_WIDTH] __attribute__((aligned(64)));
double P2[MIC_WIDTH] __attribute__((aligned(64)));
double P3[MIC_WIDTH] __attribute__((aligned(64)));
double z1[MIC_WIDTH] __attribute__((aligned(64)));
double z2[MIC_WIDTH] __attribute__((aligned(64)));
double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
if (label[i] == 0) {
double dotp = dot(px[i], py[i], pz[i]);
double Eng = sqrt(dotp + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P;
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
normP[idx] = sqrt(dotp);
deltas[idx] = par[DT_M] * beta * C;
theta0[idx] = 13.6e6 / (beta * normP[idx] * M_P * 1e9) *
Z_P * sqrt(deltas[idx] / par[X0_M]) * (1.0 + 0.038 * log(deltas[idx] / par[X0_M]));
}
}
// x-direction: See Physical Review, "Multiple Scattering"
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + size; i++) {
int idx = i - ii;
thetacou[idx] = z2[idx] * theta0[idx];
}
//unknown number of iterations, cannot vectorize
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
if (label[i] == 0) {
while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
thetacou[idx] = z2[idx] * theta0[idx];
}
}
}
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + size; i++) {
int idx = i - ii;
if (label[i] == 0) {
double xplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) +
z2[idx] * deltas[idx] * theta0[idx] / 2.0;
Rot(px[i], pz[i], rx[i], rz[i], xplane, normP[idx], thetacou[idx], deltas[idx], 1);
}
}
//generate array of random numbers
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);
//P2 = P[i], P3 = P[i+WIDTH], P4 = P[i+2*WIDTH]
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
if (label[i] == 0) {
if(P1[idx] < 0.0047) {
double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
if(P3[idx] > 0.5)
thetaru = -thetaru;
Rot(px[i] ,pz[i], rx[i], rz[i], 0, 0, thetaru, 0, 0);
}
}
}
// y-direction: See Physical Review, "Multiple Scattering"
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
thetacou[idx] = z2[idx] * theta0[idx];
}
//unknown number of iterations, cannot vectorize
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
if (label[i] == 0) {
while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
thetacou[idx] = z2[idx] * theta0[idx];
}
}
}
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
if (label[i] == 0) {
double yplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0)
+ z2[idx] * deltas[idx] * theta0[idx] / 2.0;
Rot(py[i], pz[i], ry[i], rz[i], yplane, normP[idx], thetacou[idx], deltas[idx], 2);
}
}
//generate array of random numbers
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);
//P2 = P[i], P3 = P[i+WIDTH], P4 = P[i+2*WIDTH]
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
int idx = i - ii;
if (label[i] == 0) {
if(P1[idx] < 0.0047) {
double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
if(P3[idx] > 0.5)
thetaru = -thetaru;
Rot(py[i], pz[i], ry[i], rz[i], 0, 0, thetaru, 0, 0);
}
}
}
}
__declspec(target(mic))
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
double dEdx = 0.0;
const double gamma = (Eng + M_P) / M_P;
const double gamma2 = gamma * gamma;
const double beta = sqrt(1.0 - 1.0 / gamma2);
const double beta2 = beta * beta;
const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073;
const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
double tmprnd;
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
const double delta_E = deltasrho * dEdx + tmprnd;
Eng = Eng + delta_E / 1E3;
}
if (Eng >= 0.0006) {
const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
(1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
(1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
Tmax / par[I_M] / par[I_M]) - beta2);
double tmprnd;
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
const double delta_E = deltasrho * dEdx + tmprnd;
Eng = Eng + delta_E / 1E3;
}
if ((Eng<1E-4) || (dEdx>0))
pdead = 1;
}
__declspec(target(mic))
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
const double gamma = (Eng + M_P) / M_P;
const double gamma2 = gamma * gamma;
const double beta = sqrt(1.0 - 1.0 / gamma2);
const double beta2 = beta * beta;
const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073;
const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
const double delta_E = deltasrho * dEdx + sigma_E * randv[ri];
Eng = Eng + delta_E / 1E3;
}
if (Eng >= 0.0006) {
const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
(1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
(1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
Tmax / par[I_M] / par[I_M]) - beta2);
const double delta_E = deltasrho * dEdx + sigma_E * randv[ri + MIC_WIDTH];
Eng = Eng + delta_E / 1E3;
}
}
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
//cast device memory pointers to appropriate types
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles)
{
#pragma omp parallel
{
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x
#pragma omp for simd
for (int i = 0; i < numparticles; i++) {
if ( !checkHit(data[i].Rincol.z, par) ) {
double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
data[i].Rincol.x = data[i].Rincol.x + par[DT_M] * C * data[i].Pincol.x / sq;
data[i].Rincol.y = data[i].Rincol.y + par[DT_M] * C * data[i].Pincol.y / sq;
data[i].Rincol.z = data[i].Rincol.z + par[DT_M] * C * data[i].Pincol.z / sq;
data[i].label = -2;
}
}
//for loop trough particles if label == 0 eneregy loss and if pdead update label to -1
#pragma omp for simd
for (int i = 0; i < numparticles; i++) {
int pdead = -1;
double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
double Eng = (sq - 1) * M_P;
if (data[i].label == 0) {
energyLoss(Eng, pdead, par, stream);
}
if (pdead == -1) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(data[i].Pincol, data[i].Pincol));
data[i].Pincol.x = data[i].Pincol.x * ptot / sq;
data[i].Pincol.y = data[i].Pincol.y * ptot / sq;
data[i].Pincol.z = data[i].Pincol.z * ptot / sq;
}
if (pdead == 1)
data[i].label = -1;
}
//for loop trough particles if label == 0 coulomb scat
#pragma omp for
for (int i = 0; i < numparticles; i++) {
if (data[i].label == 0) {
coulombScat(data[i].Rincol, data[i].Pincol, par, stream);
}
}
} //end omp parallel
} //end offload
return DKS_SUCCESS;
}
int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
int *label = (int*)label_ptr;
unsigned *localID = (unsigned*)localID_ptr;
double *rx = (double*)rx_ptr;
double *ry = (double*)ry_ptr;
double *rz = (double*)rz_ptr;
double *px = (double*)px_ptr;
double *py = (double*)py_ptr;
double *pz = (double*)pz_ptr;
double *par = (double*)par_ptr;
int padding = numparticles % MIC_WIDTH;
int totalpart = numparticles + padding;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
in(rx:length(0) DKS_REUSE DKS_RETAIN) \
in(ry:length(0) DKS_REUSE DKS_RETAIN) \
in(rz:length(0) DKS_REUSE DKS_RETAIN) \
in(px:length(0) DKS_REUSE DKS_RETAIN) \
in(py:length(0) DKS_REUSE DKS_RETAIN) \
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(totalpart)
{
#pragma omp parallel
{
//every thread gets its own rnd stream state
//VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
#pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
//vectorize main loop
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
if ( !checkHit(rz[i], par) ) {
double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
rx[i] = rx[i] + par[DT_M] * C * px[i] / sq;
ry[i] = ry[i] + par[DT_M] * C * py[i] / sq;
rz[i] = rz[i] + par[DT_M] * C * pz[i] / sq;
label[i] = -2;
}
}
}
//array of size 2*WIDTH for storing random values for the energyloss function
double randv[2*MIC_WIDTH] __attribute__((aligned(64)));
//for loop trough particles if label == 0 eneregy loss and if pdead update label to -1
#pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
//create array of rand values (2 per thread)
vdRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*MIC_WIDTH, randv, 0.0, 1.0);
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
double Eng = (sq - 1) * M_P;
double dEdx = 0;
if (label[i] == 0) {
energyLoss(Eng, dEdx, par, randv, i - ii);
}
if (Eng > 1e-4 && dEdx < 0) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(px[i], py[i], pz[i]));
px[i] = px[i] * ptot / sq;
py[i] = py[i] * ptot / sq;
pz[i] = pz[i] * ptot / sq;
}
if (Eng < 1e-4 || dEdx > 0)
label[i] = -1;
} //end inner energy loss loop
} //end outer energy loss loop
//vectorize coulomb scattering as much as possible
#pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
coulombScat(rx, ry, rz, px, py, pz, label, par, stream, ii, MIC_WIDTH);
} //end coulomb scattering
} //end omp parallel
} //end offload
return DKS_SUCCESS;
}
int MICCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
int &numaddback)
{
//cast device memory pointers to appropriate types
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
int privateback;
#pragma offload target(mic:m_micbase->m_device_id) \
in(data:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles) \
out(privateback)
{
//count dead and addback particles
int privateback = 0;
#pragma omp parallel for reduction(+:privateback)
for (int i = 0; i < numparticles; i++) {
if (data[i].label < 0)
privateback++;
}
//move particles with label < 0 to the end of the array (serial. can we do this parallel?)
if (privateback > 0) {
int moved = 0;
for (int i = numparticles - 1; i > 0; i--) {
if (data[i].label < 0) {
int idx = numparticles - 1 - moved;
if (i != idx) {
MIC_PART_SMALL tmp = data[i];
data[i] = data[idx];
data[idx] = tmp;
}
moved++;
}
}
}
numaddback = privateback;
}
return DKS_SUCCESS;
}
__declspec(target(mic))
void micmove(double &a, double &b) {
double tmp = a;
a = b;
b = tmp;
}
__declspec(target(mic))
void micmove(int &a, int &b) {
int tmp = a;
a = b;
b = tmp;
}
__declspec(target(mic))
void micmove(unsigned &a, unsigned &b) {
unsigned tmp = a;
a = b;
b = tmp;
}
int MICCollimatorPhysics::CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles,
int &numaddback)
{
int *label = (int*)label_ptr;
unsigned *localID = (unsigned*)localID_ptr;
double *rx = (double*)rx_ptr;
double *ry = (double*)ry_ptr;
double *rz = (double*)rz_ptr;
double *px = (double*)px_ptr;
double *py = (double*)py_ptr;
double *pz = (double*)pz_ptr;
double *par = (double*)par_ptr;
//int padding = numparticles % WIDTH;
//int totalpart = numparticles + padding;
int privateback;
#pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
in(rx:length(0) DKS_REUSE DKS_RETAIN) \
in(ry:length(0) DKS_REUSE DKS_RETAIN) \
in(rz:length(0) DKS_REUSE DKS_RETAIN) \
in(px:length(0) DKS_REUSE DKS_RETAIN) \
in(py:length(0) DKS_REUSE DKS_RETAIN) \
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles) \
out(privateback)
{
//count dead and addback particles
int privateback = 0;
#pragma omp parallel for reduction(+:privateback)
for (int i = 0; i < numparticles; i++) {
if (label[i] < 0)
privateback++;
}
//move particles with label < 0 to the end of the array (serial. can we do this parallel?)
if (privateback > 0) {
int moved = 0;
for (int i = numparticles - 1; i >= 0; i--) {
if (label[i] < 0) {
int idx = numparticles - 1 - moved;
if (i != idx) {
micmove(rx[i], rx[idx]);
micmove(ry[i], ry[idx]);
micmove(rz[i], rz[idx]);
micmove(px[i], px[idx]);
micmove(py[i], py[idx]);
micmove(pz[i], pz[idx]);
micmove(label[i], label[idx]);
micmove(localID[i], localID[idx]);
}
moved++;
}
}
}
numaddback = privateback;
}
return DKS_SUCCESS;
}
__declspec(target(mic))
inline void unitlessOff(mic_double3 &a, const double c) {
a.x *= c;
a.y *= c;
a.z *= c;
}
__declspec(target(mic))
inline void unitlessOn(mic_double3 &a, const double c) {
a.x /= c;
a.y /= c;
a.z /= c;
}
__declspec(target(mic))
mic_double3 deviceTransformTo(const mic_double3 &vec, const mic_double3 &ori) {
const double sina = sin(ori.x);
const double cosa = cos(ori.x);
const double sinb = sin(ori.y);
const double cosb = cos(ori.y);
const double sinc = sin(ori.z);
const double cosc = cos(ori.z);
mic_double3 temp;
temp.x = 0.0;
temp.y = 0.0;
temp.z = 0.0;
temp.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z;
temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x +
(cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z;
temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x +
(sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z;
return temp;
}
__declspec(target(mic))
inline void updateR(mic_double3 &R, mic_double3 &P, double dotp, double dtc) {
R.x /= dtc;
R.x += 0.5 * P.x / dotp;
R.x *= dtc;
R.y /= dtc;
R.y += 0.5 * P.y / dotp;
R.y *= dtc;
R.z /= dtc;
R.z += 0.5 * P.z / dotp;
R.z *= dtc;
}
__declspec(target(mic))
inline void push(mic_double3 *r, mic_double3 *p, double dtc, int npart) {
#pragma omp parallel for simd
for (int i = 0; i < npart; i++) {
mic_double3 R = r[i];
mic_double3 P = p[i];
double dotp = sqrt(1.0 + dot(P, P));
updateR(R, P, dotp, dtc);
r[i] = R;
}
}
__declspec(target(mic))
inline void push(mic_double3 *r, mic_double3 *p, double *gdt, double c, int npart) {
#pragma omp parallel for simd
for (int i = 0; i < npart; i++) {
mic_double3 R = r[i];
mic_double3 P = p[i];
double dtc = gdt[i] * c;
double dotp = sqrt(1.0 + dot(P, P));
updateR(R, P, dotp, dtc);
r[i] = R;
}
}
int MICCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt, int streamId)
{
mic_double3 *r = (mic_double3*)r_ptr;
mic_double3 *p = (mic_double3*)p_ptr;
double *gdt = (double*)dt_ptr;
double dtc = dt * c;
if (!usedt) {
#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(npart, dtc)
{
push(r, p, dtc, npart);
}
} else {
#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) in(npart, c)
{
push(r, p, gdt, c, npart);
}
}
return DKS_SUCCESS;
}
__declspec(target(mic))
inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
double dtc, int npart, int nsec)
{
#pragma omp parallel for simd
for (int i = 0; i < npart; i++) {
mic_double3 ori;
if (gLastSect[i] > -1 && gLastSect[i] < nsec) {
ori = gOrient[gLastSect[i]];
} else {
ori.x = 0.0;
ori.y = 0.0;
ori.z = 0.0;
}
mic_double3 tmp = deviceTransformTo(p[i], ori);
mic_double3 X = x[i];
double dotp = sqrt(1.0 + dot(tmp, tmp));
updateR(X, tmp, dotp, dtc);
x[i] = X;
}
}
__declspec(target(mic))
inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
double *gdt, double c, int npart, int nsec)
{
#pragma omp parallel for simd
for (int i = 0; i < npart; i++) {
mic_double3 ori;
if (gLastSect[i] > -1 && gLastSect[i] < nsec) {
ori = gOrient[gLastSect[i]];
} else {
ori.x = 0.0;
ori.y = 0.0;
ori.z = 0.0;
}
mic_double3 tmp = deviceTransformTo(p[i], ori);
mic_double3 X = x[i];
double dotp = sqrt(1.0 + dot(tmp, tmp));
double dtc = gdt[i] * c;
updateR(X, tmp, dotp, dtc);
x[i] = X;
}
}
int MICCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr,
void *orient_ptr, int npart,
int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
mic_double3 *x = (mic_double3*)x_ptr;
mic_double3 *p = (mic_double3*)p_ptr;
mic_double3 *gOrient = (mic_double3*)orient_ptr;
double *gdt = (double*)dt_ptr;
long *gLastSect = (long*)lastSec_ptr;
double dtc = dt * c;
if (!usedt) {
#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gOrient:length(0) DKS_RETAIN DKS_REUSE) \
in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) in(npart, nsec, dtc)
{
pushTransform(x, p, gOrient, gLastSect, dtc, npart, nsec);
}
} else {
#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) \
in(gOrient:length(0) DKS_RETAIN DKS_REUSE) in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) \
in(npart, nsec, c)
{
pushTransform(x, p, gOrient, gLastSect, gdt, c, npart, nsec);
}
}
return DKS_SUCCESS;
}