#include #include #include #include #include "DKSBase.h" #include #include "cuda_runtime.h" #include using namespace std; typedef struct { int *label; unsigned *localID; double *rx; double *ry; double *rz; double *px; double *py; double *pz; } PART; void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz, double *px, double *py, double *pz, int npart) { for (int i = 0; i < npart; i++) { label[i] = 0; localID[i] = i; rx[i] = 0.0; ry[i] = 0.0; rz[i] = 0.02; px[i] = 0.0; py[i] = 0.0; pz[i] = 3.9920183237269791e-01; } } void initParams(double *data) { data[0] = 0.0;//2.0000000000000000e-02; data[1] = 1.0;//1.0000000000000000e-02; data[2] = 2.2100000000000000e+00; data[3] = 6.0000000000000000e+00; data[4] = 1.2010700000000000e+01; data[5] = 2.6010000000000000e+00; data[6] = 1.7010000000000000e+03; data[7] = 1.2790000000000000e+03; data[8] = 1.6379999999999999e-02; data[9] = 1.9321266968325795e-01; data[10] = 7.9000000000000000e+01; data[11] = 1.0000000000000002e-12; } int main(int argc, char *argv[]) { int loop = 10; int numpart = 1e5; char *api_name = new char[10]; char *device_name = new char[10]; strcpy(api_name, "Cuda"); strcpy(device_name, "-gpu"); for (int i = 1; i < argc; i++) { if (argv[i] == string("-mic")) { strcpy(api_name, "OpenMP"); strcpy(device_name, "-mic"); } if (argv[i] == string("-npart")) { numpart = atoi(argv[i+1]); i++; } if (argv[i] == string("-loop")) { loop = atoi(argv[i+1]); i++; } } int threads = 0; /* #pragma offload target(mic:0) out(threads) { #pragma omp parallel { threads = omp_get_num_threads(); } } */ cout << "=========================BEGIN TEST=========================" << endl; cout << "Use api: " << api_name << "\t" << device_name << endl; cout << "Number of particles: " << numpart << endl; cout << "Number of loops: " << loop << endl; cout << "Number of threads: " << threads << endl; cout << "------------------------------------------------------------" << endl; //init part vector to test mc //int *label; //unsigned *localID; //double *rx, *ry, *rz, *px, *py, *pz; PART p; p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64); p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64); p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64); p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64); p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64); p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64); p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64); p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64); initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart); double *params = new double[12]; initParams(params); //init dks int ierr; DKSBase base; base.setAPI(api_name, strlen(api_name)); base.setDevice(device_name, strlen(api_name)); base.initDevice(); //init random base.callInitRandoms(numpart); //**test collimator physics and sort***// void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr; //allocate memory for particles label_ptr = base.allocateMemory(numpart, ierr); localID_ptr = base.allocateMemory(numpart, ierr); rx_ptr = base.allocateMemory(numpart, ierr); ry_ptr = base.allocateMemory(numpart, ierr); rz_ptr = base.allocateMemory(numpart, ierr); px_ptr = base.allocateMemory(numpart, ierr); py_ptr = base.allocateMemory(numpart, ierr); pz_ptr = base.allocateMemory(numpart, ierr); param_ptr = base.allocateMemory(12, ierr); //transfer data to device base.writeData(label_ptr, p.label, numpart); base.writeData(localID_ptr, p.localID, numpart); base.writeData(rx_ptr, p.rx, numpart); base.writeData(ry_ptr, p.ry, numpart); base.writeData(rz_ptr, p.rz, numpart); base.writeData(px_ptr, p.px, numpart); base.writeData(py_ptr, p.py, numpart); base.writeData(pz_ptr, p.pz, numpart); //transfer params to device base.writeData(param_ptr, params, 12); std::cout << "test runs" << std::endl; int numaddback; //test calls to do some first executions base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, py_ptr, pz_ptr, param_ptr, numpart); base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, py_ptr, pz_ptr, param_ptr, numpart, numaddback); base.syncDevice(); struct timeval timeStart, timeEnd; std::cout << "Start MC" << std::endl; gettimeofday(&timeStart, NULL); for (int i = 0; i < loop; i++) { base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, py_ptr, pz_ptr, param_ptr, numpart); base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, py_ptr, pz_ptr, param_ptr, numpart, numaddback); base.syncDevice(); } gettimeofday(&timeEnd, NULL); std::cout << "addback: " << numaddback << std::endl; std::cout << "End MC" << std::endl; double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + (timeEnd.tv_usec - timeStart.tv_usec)); std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl; std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl; //read data from device base.readData(label_ptr, p.label, numpart); base.readData(localID_ptr, p.localID, numpart); base.readData(rx_ptr, p.rx, numpart); base.readData(ry_ptr, p.ry, numpart); base.readData(rz_ptr, p.rz, numpart); base.readData(px_ptr, p.px, numpart); base.readData(py_ptr, p.py, numpart); base.readData(pz_ptr, p.pz, numpart); //free memory base.freeMemory(label_ptr, numpart); base.freeMemory(localID_ptr, numpart); base.freeMemory(rx_ptr, numpart); base.freeMemory(ry_ptr, numpart); base.freeMemory(rz_ptr, numpart); base.freeMemory(px_ptr, numpart); base.freeMemory(py_ptr, numpart); base.freeMemory(pz_ptr, numpart); base.freeMemory(param_ptr, 12); /* std::cout << std::fixed << std::setprecision(4); for (int i = 0; i < 10; i++) { std::cout << p.label[i] << "\t" << p.rx[i] << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl; } std:: cout << "..." << std::endl; for (int i = numpart - 10; i < numpart; i++) { std::cout << p.label[i] << "\t" << p.rx[i] << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl; } double arx = 0, ary = 0, arz = 0; double apx = 0, apy = 0, apz = 0; for (int i = 0; i < numpart; i++) { arx += sqrt(p.rx[i] * p.rx[i]) / numpart; ary += sqrt(p.ry[i] * p.ry[i]) / numpart; arz += sqrt(p.rz[i] * p.rz[i]) / numpart; apx += sqrt(p.px[i] * p.px[i]) / numpart; apy += sqrt(p.py[i] * p.py[i]) / numpart; apz += sqrt(p.pz[i] * p.pz[i]) / numpart; } std::cout << std::fixed << std::setprecision(10); std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl; */ cout << "==========================END TEST==========================" << endl; return 0; }