snapshot of svn

This commit is contained in:
Uldis Locans
2016-10-10 14:49:32 +02:00
commit 4fa529aaea
122 changed files with 23153 additions and 0 deletions

84
test/CMakeLists.txt Normal file
View File

@ -0,0 +1,84 @@
# Add the top-level src directory to the header search path of every test target.
INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
# Add the same directory to the linker search path used for the libraries linked below.
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#ADD_EXECUTABLE(testDKS testDKS.cpp)
#ADD_EXECUTABLE(testChi testChi.cpp)
#ADD_EXECUTABLE(testFFT testFFT.cpp)
#ADD_EXECUTABLE(testMIC testMIC.cpp)
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp)
#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp)
#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp)
#ADD_EXECUTABLE(testOffset testOffset.cpp)
#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp)
#ADD_EXECUTABLE(testMPI testMPI.cpp)
#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp)
#ADD_EXECUTABLE(testGather testGather.cpp)
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
# The only test executable that is currently enabled in this file.
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp)
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
#shared library
#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp)
#TARGET_LINK_LIBRARIES(testDKS dks)
#TARGET_LINK_LIBRARIES(testChi dks)
#TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
#TARGET_LINK_LIBRARIES(testFFT3D dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
#TARGET_LINK_LIBRARIES(testStockFFT3D dks)
#TARGET_LINK_LIBRARIES(testMemObjects dks)
#TARGET_LINK_LIBRARIES(testRCFFT dks)
#TARGET_LINK_LIBRARIES(testOffset dks)
#TARGET_LINK_LIBRARIES(testOffsetMPI dks)
#TARGET_LINK_LIBRARIES(testMPI dks)
#TARGET_LINK_LIBRARIES(testMPIFFT dks)
#TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks)
# Link the enabled collimator physics test against the dks library.
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
#TARGET_LINK_LIBRARIES(testPush dks)
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
#TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared)
#IF (${COMPILER_NAME} STREQUAL "mpicxx")
#ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp)
#ADD_EXECUTABLE(testGreens testGreens.cpp)
#ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsMPI testCollimatorPhysicsMPI.cpp)
#TARGET_LINK_LIBRARIES(testGatherAsync2 dks)
#TARGET_LINK_LIBRARIES(testGreens dks)
#TARGET_LINK_LIBRARIES(testFFTSolver dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks)
#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp)
#TARGET_LINK_LIBRARIES(testChiSquare dks)
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")

141
test/testChi.cpp Normal file
View File

@ -0,0 +1,141 @@
#include <iostream>
#include <complex>
#include <cstdlib>
#include "DKSBase.h"
#include "Utility/TimeStamp.h"
using namespace std;
/**
 * Runs the DKSBase N(t) -> chi^2 -> sum call chain five times on a synthetic
 * data set and prints the resulting sum and the total wall time.
 *
 * Usage: testChi [api_name [device_name]]
 * With one argument only the API name is overridden; with any other argument
 * count the defaults "OpenCL" / "-gpu" are used.
 *
 * @return always 0; the error codes reported by DKSBase are not inspected.
 */
int main(int argc, char *argv[]) {

  // Point at argv (or at writable defaults) instead of copying into fixed
  // size heap buffers. The previous code copied "-gpu" (5 bytes including
  // the terminator) into a 4 byte buffer, copied argv strings without any
  // length check and never released the buffers.
  char default_api[] = "OpenCL";
  char default_device[] = "-gpu";
  char *api_name = default_api;
  char *device_name = default_device;

  if (argc == 3) {
    api_name = argv[1];
    device_name = argv[2];
  } else if (argc == 2) {
    api_name = argv[1];
  }

  cout << "Use api: " << api_name << endl;
  cout << "Begin DKS Base tests" << endl;

  /* init data */
  int ierr;
  int nsize = 4000000;
  int jsize = 16;
  int psize = 6;
  int ndata = nsize*jsize;   // number of data values
  int npar = psize*jsize;    // number of parameter values

  double *data = new double[ndata];
  double *p = new double[npar];
  double data_out = 0;

  srand(time(NULL));
  // deterministic ramp in [0, 1) so that runs are reproducible
  for (int i = 0; i < ndata; i++)
    data[i] = (double)i / ndata;

  // parameters use the same scale as the data values
  for (int i = 0; i < npar; i++)
    p[i] = (double)i / ndata;
  /* end init */

  timestamp_t tstart, tend;
  tstart = get_timestamp();

  //init dks base class, set the API and init connection with the device
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();

  //ptrs to hold reference to device memory
  void *dptr, *ntptr, *pptr;

  //allocate memory on device
  dptr = base.allocateMemory<double>(ndata, ierr);
  ntptr = base.allocateMemory<double>(ndata, ierr);
  pptr = base.allocateMemory<double>(npar, ierr);

  //write data to device
  base.writeData<double>(dptr, data, ndata);

  for (int i = 0; i < 5; i++) {
    //write parameters to device
    base.writeData<double>(pptr, p, npar);

    //calculate N(t) into ntptr
    base.callNt<double>(ntptr, pptr, psize, nsize, jsize, 0.025);

    //calculate chi^2 in place (ntptr is both input and output)
    base.callChi2<double>(ntptr, dptr, ntptr, ndata);

    //sum the chi^2 values in place
    base.callSum<double>(ntptr, ntptr, ndata);

    //read calculated sum (one value)
    base.readData<double>(ntptr, &data_out, 1);

    cout << "Sum nt: " << data_out << endl;
  }

  //free device memory
  base.freeMemory<double>(dptr, ndata);
  base.freeMemory<double>(ntptr, ndata);
  base.freeMemory<double>(pptr, npar);

  tend = get_timestamp();

  cout << endl << "time: " << get_secs(tstart, tend) << endl;

  //release the host buffers
  delete[] data;
  delete[] p;

  return 0;
}

168
test/testChiSquare.cpp Normal file
View File

@ -0,0 +1,168 @@
#include <iostream>
#include <vector>
#include "DKSBase.h"
using namespace std;
/**
 * Appends the values 0, 1, ..., length-1 to every row of v.
 * Existing row contents are kept; nothing is appended when length <= 0.
 */
void initData(vector< vector<double> > &v, int length) {
  typedef vector< vector<double> >::iterator RowIter;
  for (RowIter row = v.begin(); row != v.end(); ++row) {
    int value = 0;
    while (value < length) {
      row->push_back(value);
      ++value;
    }
  }
}
/**
 * Prints v to standard output: one line per row, every value followed by a tab.
 */
void printData(vector< vector<double> > &v) {
  typedef vector< vector<double> >::const_iterator RowIter;
  typedef vector<double>::const_iterator ValueIter;
  for (RowIter row = v.begin(); row != v.end(); ++row) {
    for (ValueIter value = row->begin(); value != row->end(); ++value)
      cout << *value << "\t";
    cout << endl;
  }
}
/**
 * Fills a row-major sensors x length array so that every row holds
 * 0, 1, ..., length-1.
 */
void initData(double *data, int sensors, int length) {
  int flat = 0;  // running index, equal to sensor*length + bin
  for (int sensor = 0; sensor < sensors; ++sensor) {
    for (int bin = 0; bin < length; ++bin) {
      data[flat] = bin;
      ++flat;
    }
  }
}
/**
 * Prints a row-major sensors x length array to standard output: one line per
 * sensor, every value followed by a tab.
 */
void printData(double *data, int sensors, int length) {
  int flat = 0;  // running index, equal to sensor*length + bin
  for (int sensor = 0; sensor < sensors; ++sensor) {
    for (int bin = 0; bin < length; ++bin) {
      cout << data[flat] << "\t";
      ++flat;
    }
    cout << endl;
  }
}
/** Sets par[i] = i / npar for i in [0, npar). */
void initPar(double *par, int npar) {
  int index = 0;
  while (index < npar) {
    par[index] = (double)index / npar;
    ++index;
  }
}
/** Prints a divider line made of size '=' characters followed by a newline. */
void printDiv(int size) {
  for (int remaining = size; remaining > 0; --remaining)
    cout << "=";
  cout << endl;
}
/**
 * Host reference implementation of the chi^2 sum; prints every per-bin
 * contribution (one line per data row) and finally the total.
 *
 * @param fData            data rows; passed by const reference so the nested
 *                         vector is no longer copied on every call
 * @param par              parameter array: par[0] and par[1] are shared by all
 *                         rows, row i additionally reads par[2+4*i] .. par[5+4*i]
 * @param fTimeResolution  multiplied with fRebin to get the time step per bin
 * @param fRebin           rebinning factor
 *
 * Each row is iterated over its own length. Using the length of row 0 for
 * every row read out of bounds (or skipped bins) when rows differ in size.
 */
void calcChisq(const vector< vector<double> > &fData, double * par, double fTimeResolution, double fRebin)
{
  const double tau = 2.197019;
  const double dt0 = fTimeResolution*0.5*(fRebin-1);
  const double w = par[0]*0.08516155035269027;

  double chisq = 0.0;

  for (unsigned int i = 0; i < fData.size(); i++) {
    const vector<double> &row = fData[i];
    for (unsigned int j = 0; j < row.size(); j++) {
      const double data = row[j];
      const double time = dt0+fTimeResolution*fRebin*j;
      const double theo = par[2 + i*4] * exp(-time/tau)*(1.0 + par[3 + i*4]*exp(-0.5 * pow(par[1]*time,2.0))*cos(w*time+par[4+i*4]*1.74532925199432955e-2))+par[5+i*4];

      // empty bins cannot be used as the denominator, fall back to theo^2
      const double term = (data != 0.0) ? (theo-data)*(theo-data)/data : theo*theo;

      chisq += term;
      cout << term << "\t";
    }
    cout << endl;
  }

  cout << "Chisq: " << chisq << endl;
}
/**
 * Compares the device chi^2 calculation (callPHistoTFFcn) with the host
 * reference calcChisq on a small generated data set and prints both.
 *
 * Usage: testChiSquare [1]
 * Passing exactly one argument with the value 1 selects OpenCL, otherwise
 * Cuda is used.
 *
 * Host buffers are held in vectors so they are released automatically (the
 * raw arrays used before were never deleted) and result starts at a defined
 * value.
 */
int main(int argc, char *argv[]) {

  bool useCuda = true;
  if (argc == 2 && atoi(argv[1]) == 1)
    useCuda = false;

  int ierr;
  int sensors = 5;
  int length = 10;
  int npar = 4 * sensors + 2;     // two shared values plus four per sensor
  int ndata = sensors * length;
  double result = 0.0;
  double fTimeResolution = 0.05;
  double fRebin = 5;

  vector<double> par(npar);
  initPar(&par[0], npar);

  vector< vector< double > > fData;
  fData.resize(sensors);
  initData(fData, length);

  printData(fData);
  printDiv(75);

  DKSBase dksbase;
  if (useCuda)
    dksbase.setAPI("Cuda", 4);
  else
    dksbase.setAPI("OpenCL", 6);
  dksbase.setDevice("-gpu", 4);
  dksbase.initDevice();
  dksbase.setupFFT(0, NULL);

  void *mem_data, *mem_par, *mem_chisq;

  cout << "Allocate memory" << endl;
  mem_par = dksbase.allocateMemory<double>(npar, ierr);
  mem_data = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
  mem_chisq = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);

  cout << "Write data" << endl;
  dksbase.writeData<double>(mem_par, &par[0], npar);
  // each sensor row is written at its own offset of the flat device buffer
  for (int i = 0; i < sensors; i++)
    dksbase.writeData<double>(mem_data, &fData[i][0], length, i*length);

  cout << "Call PHistoTFFcn" << endl;
  dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq,
                          fTimeResolution, fRebin,
                          sensors, length, npar, result);
  cout << "Result: " << result << endl;

  vector<double> out_data(ndata);
  dksbase.readData<double>(mem_chisq, &out_data[0], ndata);

  printDiv(75);
  printData(&out_data[0], sensors, length);
  printDiv(75);

  calcChisq(fData, &par[0], fTimeResolution, fRebin);
  printDiv(75);

  cout << "Free memory" << endl;
  dksbase.freeMemory<double>(mem_par, npar);
  dksbase.freeMemory<double>(mem_data, ndata);
  dksbase.freeMemory<double>(mem_chisq, ndata);

  return 0;
}

193
test/testChiSquareRT.cpp Normal file
View File

@ -0,0 +1,193 @@
#include <iostream>
#include <cstdlib>
#include <string>
#include <cmath>
#include <omp.h>
#include "DKSBaseMuSR.h"
#include "Utility/DKSTimer.h"
/**
 * Fills data[0..N) either with 1.0 (ones == true) or with rand() based
 * values in [0, 1].
 */
void initData(double *data, int N, bool ones = false) {
  for (int idx = 0; idx < N; ++idx)
    data[idx] = ones ? 1.0 : (double)rand() / RAND_MAX;
}
/** Prints N elements of data on one line, each followed by a tab. */
template <typename T>
void printData(T *data, int N) {
  int idx = 0;
  while (idx < N) {
    std::cout << data[idx] << "\t";
    ++idx;
  }
  std::cout << std::endl;
}
// Theory expression passed as text to callCompileProgram() in main.
// fTheory() below evaluates the same expression on the host, so the two must
// be kept in sync.
const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])";
//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])";
//const std::string funct = "p[m[0]] * se(t, p[m[1]])";
//const std::string funct = "p[m[1]] + p[m[0]]";
/**
 * Host evaluation of the theory expression: cos(time*par[0]) - exp(-time*par[map[0]]).
 * The func argument is accepted but not used.
 */
double fTheory(double time, double *par, double *func, int *map) {
  const double oscillation = cos(time*par[0]);
  const double damping = exp(-time*par[map[0]]);
  return oscillation - damping;
}
/**
 * Serial chi^2 reference over the bins [startTimeBin, endTimeBin).
 *
 * For every bin i the model value is
 *   N0 * exp(-t/tau) * (1 + fTheory(t, par, func, map)) + bkg,  t = i * timeStep,
 * and (data[i] - model)^2 / data[i] is added to the returned sum.
 */
double testFunctionSerial(double *data, double *par, double *func, int *map,
                          double N0, double tau, double bkg, double timeStep,
                          int startTimeBin, int endTimeBin)
{
  double chisq = 0;

  int bin = startTimeBin;
  while (bin < endTimeBin) {
    const double time = bin * timeStep;
    const double theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
    const double diff = data[bin] - theo;
    chisq += diff * diff / data[bin];
    ++bin;
  }

  return chisq;
}
/**
 * OpenMP version of testFunctionSerial: sums (data[i] - model)^2 / data[i]
 * over the bins [startTimeBin, endTimeBin) with a reduction on chisq.
 *
 * The per-iteration temporaries are declared inside the loop body so that
 * every thread has its own copy. The original private() clause listed time
 * and diff but not theo, which left theo shared between threads (data race).
 */
double testFunctionParallel(double *data, double *par, double *func, int *map,
                            double N0, double tau, double bkg, double timeStep,
                            int startTimeBin, int endTimeBin)
{
  double chisq = 0;

  // dynamic schedule with at least 10 bins per chunk
  int chunk = (endTimeBin - startTimeBin) / omp_get_num_procs();
  if (chunk < 10)
    chunk = 10;

#pragma omp parallel for default(shared) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq)
  for (int i = startTimeBin; i < endTimeBin; ++i) {
    const double time = i * timeStep;
    const double theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
    const double diff = data[i] - theo;
    chisq += diff * diff / data[i];
  }

  return chisq;
}
/**
 * Times the serial host chi^2 calculation against the DKSBaseMuSR device
 * implementation and prints results and timings.
 *
 * Usage: testChiSquareRT [Ndata [api]]
 *   Ndata  number of data points (default 8, must be positive)
 *   api    1 selects Cuda (default), any other value OpenCL
 *
 * @return EXIT_SUCCESS, or EXIT_FAILURE when Ndata is not positive or
 *         callLaunchChiSquare reports a non-zero error code.
 */
int main(int argc, char *argv[]) {

  int Loop = 100;

  //init test data on the host
  int Ndata = 8;
  if (argc > 1)
    Ndata = atoi(argv[1]);

  // atoi yields 0 for non-numeric input; a non-positive size cannot be allocated
  if (Ndata < 1) {
    std::cerr << "Ndata must be a positive integer" << std::endl;
    return EXIT_FAILURE;
  }

  int api = 1;
  if (argc > 2)
    api = atoi(argv[2]);

  int Npar = 66;
  int Nfunc = 1;
  int Nmap = 4;

  double *data = new double[Ndata];
  double *par = new double[Npar];
  double *func = new double[Nfunc];
  int *map = new int[Nmap];

  initData(data, Ndata);
  initData(par, Npar);
  initData(func, Nfunc);

  map[0] = 1;
  map[1] = 2;
  map[2] = 3;
  map[3] = 4;

  //create timers
  DKSTimer serialTimer;
  DKSTimer cudaTimer;
  DKSTimer ompTimer;
  DKSTimer gpuOverhead;

  serialTimer.init("Serial timer");
  cudaTimer.init("Cuda timer");
  ompTimer.init("OpenMP timer");
  gpuOverhead.init("Overhead for gpu");

  //serial version
  double resultSerial = 0.0;
  serialTimer.start();
  for (int i = 0; i < Loop; i++)
    resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
  serialTimer.stop();

  //openmp version (currently disabled, the timer only brackets an empty region)
  double resultOMP = 0.0;
  ompTimer.start();
  //for (int i = 0; i < Loop; i++)
  //  resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
  ompTimer.stop();

  //create and init dksbase
  gpuOverhead.start();
  DKSBaseMuSR dksbase;
  if (api == 1)
    dksbase.setAPI("Cuda");
  else
    dksbase.setAPI("OpenCL");

  dksbase.setDevice("-gpu");
  dksbase.initDevice();
  dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap);

  //allocate memory on the device
  int ierr;
  void *data_ptr;

  data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);

  dksbase.writeData<double>(data_ptr, data, Ndata);
  dksbase.writeFunctions(func, Nfunc);
  dksbase.writeMaps(map, Nmap);

  dksbase.callCompileProgram(funct);
  gpuOverhead.stop();

  int status = EXIT_SUCCESS;
  double resultCuda = 0.0;
  cudaTimer.start();
  for (int i = 0; i < Loop; i++) {
    dksbase.writeParams(par, Npar);
    // separate name: this used to shadow the outer ierr
    int launchErr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap,
                                                0.0, 0.1, 0, resultCuda);

    // leave the loop instead of calling exit() so that the cleanup below still runs
    if (launchErr != 0) {
      status = EXIT_FAILURE;
      break;
    }
  }
  cudaTimer.stop();

  if (status == EXIT_SUCCESS) {
    std::cout << std::endl;
    std::cout << "=======================Results=======================" << std::endl;
    std::cout << "Result serial  = " << resultSerial << std::endl;
    std::cout << "Result prallel = " << resultOMP << std::endl;
    std::cout << "Result cuda    = " << resultCuda << std::endl;

    std::cout << std::endl;
    std::cout << "=======================Timings=======================" << std::endl;
    serialTimer.print();
    ompTimer.print();
    cudaTimer.print();
    gpuOverhead.print();
    std::cout << std::endl;
  }

  dksbase.freeMemory<double>(data_ptr, Ndata);

  //release the host buffers
  delete[] data;
  delete[] par;
  delete[] func;
  delete[] map;

  return status;
}

View File

@ -0,0 +1,248 @@
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
#include <vector_types.h>
#include "cuda_runtime.h"
using namespace std;
/**
 * Particle record as it is copied to and from the device in this test:
 * an integer label, an unsigned local id and two 3-component double arrays.
 * Member order and types must not change, the struct is transferred as raw bytes.
 */
struct PART_SMALL {
  int label;
  unsigned int localID;
  double Rincol[3];
  double Pincol[3];
};
/** Plain three-component vector of doubles. */
struct Vector {
  double x;
  double y;
  double z;
};
/**
 * Builds the start particle used by this test: label 0, localID d,
 * Rincol = (0, 0, 0.02) and Pincol = (0, 0, 3.9920183237269791e-01).
 */
PART_SMALL initPartSmall(int d) {
  PART_SMALL part = {
    0,
    static_cast<unsigned>(d),
    { 0.0, 0.0, 0.02 },
    { 0.0, 0.0, 3.9920183237269791e-01 }
  };
  return part;
}
/** Returns a Vector with all three components set to 0.5. */
Vector initVector() {
  Vector result = { 0.5, 0.5, 0.5 };
  return result;
}
/** Prints all fields of one particle on a single line of standard output. */
void printPart(PART_SMALL p) {
  cout << "label: " << p.label << ", " << "localid: " << p.localID << ",";

  // position: every component is followed by a separator
  cout << "Rincol: ";
  for (int axis = 0; axis < 3; ++axis)
    cout << p.Rincol[axis] << ", ";

  // momentum: separators only between the components
  cout << "Pincol: ";
  for (int axis = 0; axis < 3; ++axis) {
    if (axis > 0)
      cout << ", ";
    cout << p.Pincol[axis];
  }

  cout << endl;
}
/** Prints the three components of v separated by tabs, followed by a newline. */
void printVector(Vector v) {
  cout << v.x << "\t";
  cout << v.y << "\t";
  cout << v.z << endl;
}
/** Initialises p[0..N) with initPartSmall, using the array index as argument. */
void initParts(PART_SMALL *p, int N) {
  int index = 0;
  while (index < N) {
    p[index] = initPartSmall(index);
    ++index;
  }
}
/** Prints p[0..N) with printPart, followed by one empty line. */
void printParts(PART_SMALL *p, int N) {
  int index = 0;
  while (index < N) {
    printPart(p[index]);
    ++index;
  }
  cout << endl;
}
/** Sets every element of v[0..N) to the value returned by initVector. */
void initVectors(Vector *v, int N) {
  int index = 0;
  while (index < N) {
    v[index] = initVector();
    ++index;
  }
}
/** Prints v[0..N) with printVector, followed by one empty line. */
void printVectors(Vector *v, int N) {
  int index = 0;
  while (index < N) {
    printVector(v[index]);
    ++index;
  }
  cout << endl;
}
/**
 * Writes the 12 fixed parameter values used by this test into data[0..12).
 * The values are copied to the device and handed to the collimator physics
 * call in main.
 */
void initParams(double *data) {
  static const double kParams[12] = {
    0.0, //2.0000000000000000e-02;
    1.0, //1.0000000000000000e-02;
    2.2100000000000000e+00,
    6.0000000000000000e+00,
    1.2010700000000000e+01,
    2.6010000000000000e+00,
    1.7010000000000000e+03,
    1.2790000000000000e+03,
    1.6379999999999999e-02,
    1.9321266968325795e-01,
    7.9000000000000000e+01,
    1.0000000000000002e-12
  };

  for (int slot = 0; slot < 12; ++slot)
    data[slot] = kParams[slot];
}
/** Prints data[0..N) on one line, every value followed by a tab. */
void printDouble(double *data, int N) {
  for (const double *cursor = data; cursor < data + N; ++cursor)
    std::cout << *cursor << "\t";
  std::cout << std::endl;
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of particles: " << numpart << endl;
cout << "Number of loops: " << loop << endl;
cout << "------------------------------------------------------------" << endl;
//init part vector to test mc
PART_SMALL *parts = new PART_SMALL[numpart];
initParts(parts, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *part_ptr, *param_ptr;
//allocate memory for particles
part_ptr = base.allocateMemory<PART_SMALL>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<PART_SMALL>(part_ptr, parts, numpart);
base.writeData<double>(param_ptr, params, 12);
int numaddback;
//test calls to do some first executions
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
//std::cout << "particles to add back: " << numaddback << std::endl;
struct timeval timeStart, timeEnd;
std::cout << "Start MC" << std::endl;
gettimeofday(&timeStart, NULL);
for (int i = 0; i < loop; i++) {
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
}
gettimeofday(&timeEnd, NULL);
std::cout << "addback: " << numaddback << std::endl;
std::cout << "End MC" << std::endl;
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
(timeEnd.tv_usec - timeStart.tv_usec));
std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
//read data from device
base.readData<PART_SMALL>(part_ptr, parts, numpart);
//free memory
base.freeMemory<PART_SMALL>(part_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
double arx = 0, ary = 0, arz = 0;
double apx = 0, apy = 0, apz = 0;
for (int i = 0; i < numpart; i++) {
arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart;
ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart;
arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart;
apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart;
apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart;
apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart;
}
std::cout << std::fixed << std::setprecision(10);
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
cout << "==========================END TEST==========================" << endl;
return 0;
}

View File

@ -0,0 +1,126 @@
#include <iostream>
#include <vector>
#include "DKSBase.h"
#include "cuda_runtime.h"
#include <mpi.h>
using namespace std;
/**
 * Full particle record as it is copied to and from the device in this test.
 * Member order and types must not change, the struct is transferred as raw bytes.
 */
struct PART {
  int label;
  unsigned int localID;
  double Rincol[3];
  double Pincol[3];
  long IDincol;
  int Binincol;
  double DTincol;
  double Qincol;
  long LastSecincol;
  double Bfincol[3];
  double Efincol[3];
};
/**
 * Builds a test particle from the integer d: all scalar fields are set to d,
 * the Rincol/Pincol components to 0.5 and the Bfincol/Efincol components
 * to 1/(d+1).
 */
PART initPart(int d) {
  PART part;

  // scalar fields all carry the particle number
  part.label = d;
  part.localID = d;
  part.IDincol = d;
  part.Binincol = d;
  part.DTincol = d;
  part.Qincol = d;
  part.LastSecincol = d;

  const double fieldValue = 1.0 / (d+1);
  int axis = 0;
  while (axis < 3) {
    part.Rincol[axis] = 0.5;// / (d+1);
    part.Pincol[axis] = 0.5;// / (d+1);
    part.Bfincol[axis] = fieldValue;
    part.Efincol[axis] = fieldValue;
    ++axis;
  }

  return part;
}
/**
 * Prints label, Rincol and Pincol of one particle on a single line.
 * The remaining fields (localID, IDincol, Binincol, DTincol, Qincol,
 * LastSecincol, Bfincol, Efincol) are intentionally not printed.
 */
void printPart(PART p) {
  cout << "label: " << p.label << ", ";

  cout << "Rincol: ";
  for (int axis = 0; axis < 3; ++axis)
    cout << p.Rincol[axis] << ", ";

  cout << "Pincol: ";
  for (int axis = 0; axis < 3; ++axis)
    cout << p.Pincol[axis] << ", ";

  cout << endl;
}
int main(int argc, char *argv[]) {
int ierr;
int rank, nprocs;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int numpart = 500501;
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
base.callInitRandoms(numpart);
PART tmp;
vector<PART> p;
vector<PART> p_out;
p_out.resize(numpart);
for (int i = 0; i < numpart; i++) {
tmp = initPart(i + 1);
p.push_back(tmp);
}
if (numpart <= 20) {
for (int i = 0; i < 10; i++)
printPart(p[i]);
cout << endl;
}
double params[19];
for (int i = 0; i < 19; i++)
params[i] = 0.05;
params[0] = 0;
params[1] = 1;
void *mem_ptr, *par_ptr;
par_ptr = base.allocateMemory<double>(19, ierr);
base.writeData<double>(par_ptr, params, 19);
mem_ptr = base.allocateMemory<PART>(numpart, ierr);
base.writeData<PART>(mem_ptr, &p[0], numpart);
int addback, dead;
for (int i = 0; i < 100; i++)
base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead);
cout << "Add back: " << addback << ", dead: " << dead << endl;
base.readData<PART>(mem_ptr, &p_out[0], numpart);
base.freeMemory<PART>(mem_ptr, ierr);
base.freeMemory<double>(par_ptr, ierr);
if (numpart <= 20) {
for (int i = 0; i < numpart; i++)
printPart(p_out[i]);
}
MPI_Finalize();
return 0;
}

View File

@ -0,0 +1,250 @@
#include <iostream>
#include <iomanip>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
#include <vector_types.h>
#include "cuda_runtime.h"
#include <omp.h>
using namespace std;
/**
 * Structure-of-arrays particle storage: one pointer per particle attribute.
 * The struct does not own the arrays; main allocates and fills them.
 */
struct PART {
  int *label;
  unsigned int *localID;
  double *rx;
  double *ry;
  double *rz;
  double *px;
  double *py;
  double *pz;
};
/**
 * Initialises npart particles in structure-of-arrays form: label 0,
 * localID = index, position (0, 0, 0.02) and momentum
 * (0, 0, 3.9920183237269791e-01).
 */
void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz,
               double *px, double *py, double *pz, int npart) {
  const double startZ = 0.02;
  const double startPz = 3.9920183237269791e-01;

  int idx = 0;
  while (idx < npart) {
    label[idx] = 0;
    localID[idx] = idx;

    rx[idx] = 0.0;
    ry[idx] = 0.0;
    rz[idx] = startZ;

    px[idx] = 0.0;
    py[idx] = 0.0;
    pz[idx] = startPz;

    ++idx;
  }
}
/**
 * Writes the 12 fixed parameter values used by this test into data[0..12).
 * The values are copied to the device and handed to the collimator physics
 * calls in main.
 */
void initParams(double *data) {
  double *out = data;

  *out++ = 0.0;//2.0000000000000000e-02;
  *out++ = 1.0;//1.0000000000000000e-02;
  *out++ = 2.2100000000000000e+00;
  *out++ = 6.0000000000000000e+00;
  *out++ = 1.2010700000000000e+01;
  *out++ = 2.6010000000000000e+00;
  *out++ = 1.7010000000000000e+03;
  *out++ = 1.2790000000000000e+03;
  *out++ = 1.6379999999999999e-02;
  *out++ = 1.9321266968325795e-01;
  *out++ = 7.9000000000000000e+01;
  *out   = 1.0000000000000002e-12;
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
int threads = 0;
/*
#pragma offload target(mic:0) out(threads)
{
#pragma omp parallel
{
threads = omp_get_num_threads();
}
}
*/
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of particles: " << numpart << endl;
cout << "Number of loops: " << loop << endl;
cout << "Number of threads: " << threads << endl;
cout << "------------------------------------------------------------" << endl;
//init part vector to test mc
//int *label;
//unsigned *localID;
//double *rx, *ry, *rz, *px, *py, *pz;
PART p;
p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64);
p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64);
p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
//allocate memory for particles
label_ptr = base.allocateMemory<int>(numpart, ierr);
localID_ptr = base.allocateMemory<unsigned>(numpart, ierr);
rx_ptr = base.allocateMemory<double>(numpart, ierr);
ry_ptr = base.allocateMemory<double>(numpart, ierr);
rz_ptr = base.allocateMemory<double>(numpart, ierr);
px_ptr = base.allocateMemory<double>(numpart, ierr);
py_ptr = base.allocateMemory<double>(numpart, ierr);
pz_ptr = base.allocateMemory<double>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<int>(label_ptr, p.label, numpart);
base.writeData<unsigned>(localID_ptr, p.localID, numpart);
base.writeData<double>(rx_ptr, p.rx, numpart);
base.writeData<double>(ry_ptr, p.ry, numpart);
base.writeData<double>(rz_ptr, p.rz, numpart);
base.writeData<double>(px_ptr, p.px, numpart);
base.writeData<double>(py_ptr, p.py, numpart);
base.writeData<double>(pz_ptr, p.pz, numpart);
//transfer params to device
base.writeData<double>(param_ptr, params, 12);
std::cout << "test runs" << std::endl;
int numaddback;
//test calls to do some first executions
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
base.syncDevice();
struct timeval timeStart, timeEnd;
std::cout << "Start MC" << std::endl;
gettimeofday(&timeStart, NULL);
for (int i = 0; i < loop; i++) {
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
base.syncDevice();
}
gettimeofday(&timeEnd, NULL);
std::cout << "addback: " << numaddback << std::endl;
std::cout << "End MC" << std::endl;
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
(timeEnd.tv_usec - timeStart.tv_usec));
std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl;
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
//read data from device
base.readData<int>(label_ptr, p.label, numpart);
base.readData<unsigned>(localID_ptr, p.localID, numpart);
base.readData<double>(rx_ptr, p.rx, numpart);
base.readData<double>(ry_ptr, p.ry, numpart);
base.readData<double>(rz_ptr, p.rz, numpart);
base.readData<double>(px_ptr, p.px, numpart);
base.readData<double>(py_ptr, p.py, numpart);
base.readData<double>(pz_ptr, p.pz, numpart);
//free memory
base.freeMemory<int>(label_ptr, numpart);
base.freeMemory<unsigned>(localID_ptr, numpart);
base.freeMemory<double>(rx_ptr, numpart);
base.freeMemory<double>(ry_ptr, numpart);
base.freeMemory<double>(rz_ptr, numpart);
base.freeMemory<double>(px_ptr, numpart);
base.freeMemory<double>(py_ptr, numpart);
base.freeMemory<double>(pz_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
/*
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << p.label[i] << "\t" << p.rx[i]
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << p.label[i] << "\t" << p.rx[i]
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
}
double arx = 0, ary = 0, arz = 0;
double apx = 0, apy = 0, apz = 0;
for (int i = 0; i < numpart; i++) {
arx += sqrt(p.rx[i] * p.rx[i]) / numpart;
ary += sqrt(p.ry[i] * p.ry[i]) / numpart;
arz += sqrt(p.rz[i] * p.rz[i]) / numpart;
apx += sqrt(p.px[i] * p.px[i]) / numpart;
apy += sqrt(p.py[i] * p.py[i]) / numpart;
apz += sqrt(p.pz[i] * p.pz[i]) / numpart;
}
std::cout << std::fixed << std::setprecision(10);
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
*/
cout << "==========================END TEST==========================" << endl;
return 0;
}

15
test/testDKS.cpp Normal file
View File

@ -0,0 +1,15 @@
#include <iostream>
#include <complex>
#include "DKSBase.h"
using namespace std;
/**
 * Minimal DKS smoke test: constructs a DKSBase object and calls getDevices()
 * on it. Command line arguments are ignored.
 */
int main(int /*argc*/, char ** /*argv*/) {

  DKSBase base = DKSBase();

  base.getDevices();

  return 0;
}

83
test/testFFT.cpp Normal file
View File

@ -0,0 +1,83 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/**
 * One-dimensional FFT round trip: pushes N complex values to the device,
 * runs callFFT, callIFFT and callNormalizeFFT on them and prints the input
 * next to the values read back.
 *
 * Usage: testFFT [api_name [device_name]]
 * Defaults are "OpenCL" and "-gpu"; any other argument count also selects
 * the defaults.
 *
 * @return always 0; the error codes reported by DKSBase are not inspected.
 */
int main(int argc, char *argv[]) {

  // Point at argv (or at writable defaults) instead of copying into 10 byte
  // heap buffers: longer arguments overflowed those buffers and the buffers
  // were never released.
  char default_api[] = "OpenCL";
  char default_device[] = "-gpu";
  char *api_name = default_api;
  char *device_name = default_device;

  if (argc == 2) {
    api_name = argv[1];
  } else if (argc == 3) {
    api_name = argv[1];
    device_name = argv[2];
  }

  cout << "Use api: " << api_name << "\t" << device_name << endl;

  cout << "Begin DKS Base tests" << endl;

  int N = 2;
  int dimsize[3] = {N, N, N};

  complex<double> *cdata = new complex<double>[N];
  complex<double> *cfft = new complex<double>[N];

  // all zero except for the first input element
  for (int i = 0; i < N; i++) {
    cdata[i] = complex<double>(0, 0);
    cfft[i] = complex<double>(0, 0);
  }
  cdata[0] = complex<double>(1.73205, 1.73205);

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  // length of the device name itself; strlen(api_name) was passed here before
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();

  void *mem_ptr;
  int ierr;

  /* write data to device */
  mem_ptr = base.pushData< complex<double> >( (const void*)cdata, N, ierr);

  /* execute fft */
  base.callFFT(mem_ptr, 1, dimsize);

  /* execute ifft */
  base.callIFFT(mem_ptr, 1, dimsize);

  /* execute normalize */
  base.callNormalizeFFT(mem_ptr, 1, dimsize);

  /* read data from device */
  base.pullData< complex<double> >(mem_ptr, cfft, N);

  /* print results */
  cout << "Data" << endl;
  for (int i = 0; i < N; i++)
    cout << cdata[i] << "\t";
  cout << endl;

  cout << "FFT" << endl;
  for (int i = 0; i < N; i++)
    cout << cfft[i] << "\t";
  cout << endl;

  /* release the host buffers */
  delete[] cdata;
  delete[] cfft;

  return 0;
}

159
test/testFFT3D.cpp Normal file
View File

@ -0,0 +1,159 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
void printData3DN4(complex<double>* &data, int N, int dim);
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/*
 * 3D complex FFT round trip: FFT -> IFFT -> normalize on the device, then
 * compare the data read back with the input.
 *
 * usage - ./testFFT3D [N [api_name [device_name]]]
 * With only N given the API is "Cuda"; with no arguments (or more than three)
 * N = 16 and the API is "OpenCL". The device defaults to "-gpu".
 */
int main(int argc, char *argv[]) {

  /* Point at argv / string literals instead of copying into fixed 10-byte
     buffers, which strcpy overran for long arguments. */
  int N = 16;
  const char *api_name = "OpenCL";
  const char *device_name = "-gpu";
  if (argc >= 2 && argc <= 4) {
    N = atoi(argv[1]);
    api_name = (argc >= 3) ? argv[2] : "Cuda";
    if (argc == 4)
      device_name = argv[3];
  }

  /* atoi yields 0 for non-numeric input; a non-positive N would give an
     invalid allocation size below. */
  if (N <= 0) {
    cout << "N must be a positive integer" << endl;
    return 1;
  }

  cout << "Use api: " << api_name << ", " << device_name << endl;

  int dimsize[3] = {N, N, N};

  cout << "Begin DKS Base tests, N = " << N << endl;

  int dim = 3;
  const int total = N*N*N;
  complex<double> *cdata = new complex<double>[total];
  complex<double> *cifft = new complex<double>[total];

  /* input varies along the fastest index only; result buffer starts at zero */
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
      }
    }
  }

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();
  base.setupFFT(3, dimsize);

  void *mem_ptr;
  int ierr;

  /* allocate memory on device */
  mem_ptr = base.allocateMemory< complex<double> >(total, ierr);

  /* write data to device */
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, total);

  /* execute fft */
  base.callFFT(mem_ptr, 3, dimsize);

  /* execute ifft */
  base.callIFFT(mem_ptr, 3, dimsize);

  /* execute normalize */
  base.callNormalizeFFT(mem_ptr, 3, dimsize);

  /* read data from device */
  base.readData< complex<double> >(mem_ptr, cifft, total);

  /* free device memory */
  base.freeMemory< complex<double> >(mem_ptr, total);

  /* compare results */
  compareData(cdata, cifft, N, dim);

  /* release host buffers */
  delete[] cdata;
  delete[] cifft;

  return 0;
}
/*
 * Print an N^dim block of complex values to stdout, one line per row and an
 * empty line after each plane. With normalize set only the real part divided
 * by N is printed; otherwise real and imaginary part are printed.
 */
void printData(complex<double>* &data, int N, int dim, bool normalize) {

  // the two outer extents collapse to 1 for lower dimensional data
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;

  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      const int first = p*planes*planes + r*rows;
      for (int c = 0; c < N; ++c) {
        const complex<double> &value = data[first + c];
        if (normalize) {
          cout << value.real() / N << "\t";
        } else {
          cout << value.real() << " ";
          cout << value.imag() << "\t";
        }
      }
      cout << endl;
    }
    cout << endl;
  }
}
/*
 * Print an N x N x N complex array, one output line per middle index j, with
 * all (i, k) entries of that j on the line as "re; im". Components with
 * magnitude below 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {

  const double eps = 10e-5;

  for (int row = 0; row < N; ++row) {
    for (int plane = 0; plane < N; ++plane) {
      const complex<double> *line = data + plane*N*N + row*N;
      for (int col = 0; col < N; ++col) {
        double re = line[col].real();
        double im = line[col].imag();
        if (-eps < re && re < eps)
          re = 0;
        if (-eps < im && im < eps)
          im = 0;
        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Accumulate the absolute differences of the real and imaginary parts of two
 * complex arrays of N^dim elements and print the sum to stdout.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {

  // outer extents are 1 when the data has fewer than 3 / 2 dimensions
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double diff = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int first = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k) {
        const complex<double> &lhs = data1[first + k];
        const complex<double> &rhs = data2[first + k];
        diff += fabs(lhs.real() - rhs.real());
        diff += fabs(lhs.imag() - rhs.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
}

199
test/testFFT3DRC.cpp Normal file
View File

@ -0,0 +1,199 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
void initData(double *data, int dimsize[3]);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
void printHelp();
/* Seconds elapsed between two gettimeofday() samples. */
static double elapsedSeconds(const struct timeval &start, const struct timeval &end) {
  return ( (end.tv_sec - start.tv_sec) * 1e6 +
           (end.tv_usec - start.tv_usec) ) * 1e-6;
}

/*
 * Timed 3D real->complex / complex->real FFT round trip.
 * Grid size and repetition count come from readParams (-grid, -loop); one
 * untimed warm-up run is executed before the timed loop, the result of the
 * last run is compared with the input and the timings are printed.
 *
 * NOTE(review): `base` is only declared inside the DKS_MIC / DKS_CUDA
 * sections below, so the file compiles only when exactly one of the two
 * macros is defined.
 */
int main(int argc, char *argv[]) {

  int N1 = 8;
  int N2 = 8;
  int N3 = 8;
  int dim = 3;
  int loop = 10;

  if ( readParams(argc, argv, N1, N2, N3, loop) )
    return 0;

  /* atoi gives 0 for non-numeric input; non-positive values would lead to
     invalid allocation sizes and a division by zero in the averages. */
  if (N1 <= 0 || N2 <= 0 || N3 <= 0 || loop <= 0) {
    std::cout << "Grid dimensions and loop count must be positive" << std::endl;
    return 1;
  }

  int dimsize[3] = {N3, N2, N1};
  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];

  double *rdata = new double[sizereal];
  double *outdata = new double[sizereal];
  complex<double> *cfft = new complex<double>[sizecomp];

  /* complex::real()/imag() return by value since C++11, so the components
     cannot be assigned through them; construct the value instead. */
  for (int i=0; i<sizecomp; ++i)
    cfft[i] = complex<double>(7., 3.33);

  initData(rdata, dimsize);

  /* init DKSBase */
  cout << "Init device and set function" << endl;

#ifdef DKS_MIC
  DKSBase base;
  base.setAPI("OpenMP", 6);
  base.setDevice("-mic", 4);
  base.initDevice();
  base.setupFFTRC(dim, dimsize);
  /* setup backward fft (COMPLEX->REAL) */
  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
#endif

#ifdef DKS_CUDA
  DKSBase base;
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();
  base.setupFFT(dim, dimsize);
#endif

  // allocate memory on device
  int ierr;
  void *real_ptr, *comp_ptr, *real_res_ptr;

  real_ptr = base.allocateMemory<double>(sizereal, ierr);
  real_res_ptr = base.allocateMemory<double>(sizereal, ierr);
  comp_ptr = base.allocateMemory< std::complex<double> >(sizecomp, ierr);

  // execute one run before starting the timers
  base.writeData<double>(real_ptr, rdata, sizereal);
  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
  base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
  base.readData<double>(real_res_ptr, outdata, sizereal);

  // timers for the whole loop and for the individual FFT / IFFT calls;
  // per-call times are accumulated directly, so no runtime-sized arrays
  // (a non-standard extension) are needed
  struct timeval timeStart, timeEnd;
  struct timeval fftStart, fftEnd;
  struct timeval ifftStart, ifftEnd;
  double tfft = 0;
  double tifft = 0;

  gettimeofday(&timeStart, NULL);
  for (int i=0; i<loop; ++i){

    // write data to device
    base.writeData<double>(real_ptr, rdata, sizereal);

    // execute rcfft
    gettimeofday(&fftStart, NULL);
    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
    gettimeofday(&fftEnd, NULL);

    // execute crfft
    gettimeofday(&ifftStart, NULL);
    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
    gettimeofday(&ifftEnd, NULL);

    //normalize
#ifdef DKS_CUDA
    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
#endif

    // read IFFT data from device
    base.readData<double>(real_res_ptr, outdata, sizereal);

    tfft += elapsedSeconds(fftStart, fftEnd);
    tifft += elapsedSeconds(ifftStart, ifftEnd);
  }
  gettimeofday(&timeEnd, NULL);

  // free device memory
  base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
  base.freeMemory<double>(real_ptr, sizereal);
  base.freeMemory<double>(real_res_ptr, sizereal);

  // compare in and out data to see if we get back the same results
  compareData(rdata, outdata, N1, N2, N3, dim);

  // total wall time of the timed loop in seconds
  double ttot = elapsedSeconds(timeStart, timeEnd);

  //print timing results
  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
            << "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
            << "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
            << "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
            << "\n\n";

  // release host buffers
  delete[] rdata;
  delete[] outdata;
  delete[] cfft;

  return 0;
}
/*
 * Sum the absolute element-wise differences of two real arrays holding
 * NI*NJ*NK values and print the sum to stdout. The dim argument is not used.
 */
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {

  double total = 0;

  for (int i = 0; i < NI; ++i) {
    for (int j = 0; j < NJ; ++j) {
      const int offset = j*NI + i;
      for (int k = 0; k < NK; ++k) {
        const int id = k*NI*NJ + offset;
        total += fabs(data1[id] - data2[id]);
      }
    }
  }

  std::cout << "RC <--> CR diff: " << total << std::endl;
}
/*
 * Fill a dimsize[2] x dimsize[1] x dimsize[0] array (dimsize[0] fastest) so
 * that every element holds its index along the fastest dimension.
 */
void initData(double *data, int dimsize[3]) {

  const int nx = dimsize[0];
  const int ny = dimsize[1];
  const int nz = dimsize[2];

  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      const int first = z*ny*nx + y*nx;
      for (int x = 0; x < nx; ++x)
        data[first + x] = x;
    }
  }
}
/* Print a short description of the test and its command line to stdout. */
void printHelp() {
  std::cout << std::endl;

  /* adjacent literals are concatenated, so the first one needs the
     trailing space */
  std::cout << "testFFT3DRC executes 3D real complex and 3D complex real "
            << "function on the Intel MIC.\n";
  std::cout << "Operations performed by testFFT3DRC are: "
            << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n";

  std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z "
            << "-loop $l\n";
  std::cout << "where $x $y $z are number of elements in each dimension and "
            << "$l is the number of times all the operations will be performed.\n";

  std::cout << std::endl;
}

/*
 * Parse the command line.
 *   -grid x y z   sets N1, N2, N3
 *   -loop l       sets loop
 *   -h | -help    prints the help text
 * Unknown arguments are ignored. Returns true when the caller should exit
 * (help was requested or an option is missing its values), false otherwise.
 * Output parameters keep their incoming values unless their option is given
 * with all of its values.
 */
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {

  for (int i = 1; i < argc; i++) {

    const std::string arg(argv[i]);

    if (arg == "-grid") {
      /* all three values must exist; argv[argc] is a null pointer and must
         not be handed to atoi */
      if (i + 3 >= argc) {
        std::cout << "-grid requires three values" << std::endl;
        printHelp();
        return true;
      }
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
      N3 = atoi(argv[i + 3]);
      i += 3;
    } else if (arg == "-loop") {
      if (i + 1 >= argc) {
        std::cout << "-loop requires a value" << std::endl;
        printHelp();
        return true;
      }
      loop = atoi(argv[i + 1]);
      i += 1;
    } else if (arg == "-h" || arg == "-help") {
      printHelp();
      return true;
    }
  }

  return false;
}

220
test/testFFT3DRC_MIC.cpp Normal file
View File

@ -0,0 +1,220 @@
#include <iostream>
#include <stdlib.h>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
void printData3DN4(complex<double>* &data, int N, int dim);
void printData3DN4(double* data, int N, int dim);
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
void compareData(double* data1, double* data2, int N, int dim);
/* (K*L) % M as a double; K is widened to long long before the multiplication
   so the product is formed in 64 bit arithmetic */
static double moda(int K, int L, int M)
{
  const long long product = static_cast<long long>(K) * L;
  return static_cast<double>(product % M);
}

/*
 * Fill x with a cosine whose phase is built from the harmonics H1, H2, H3.
 * Elements are addressed row-major with the fastest dimension padded to
 * (N3/2+1)*2 entries per row; the padding entries are not written.
 */
static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4)
{
  const double TWOPI = 6.2831853071795864769;

  /* Generalized strides for row-major addressing of x */
  const int S3 = 1;
  const int S2 = (N3/2+1)*2;
  const int S1 = N2*(N3/2+1)*2;

  const bool single = (N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0;
  const double factor = single ? 1.0 : 2.0;

  for (int n1 = 0; n1 < N1; ++n1) {
    const double phase1 = moda(n1,H1,N1) / N1;

    for (int n2 = 0; n2 < N2; ++n2) {
      const double phase12 = phase1 + moda(n2,H2,N2) / N2;

      for (int n3 = 0; n3 < N3; ++n3) {
        const double phase = phase12 + moda(n3,H3,N3) / N3;
        const int index = n1*S1 + n2*S2 + n3*S3;
        x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3);
      }
    }
  }
}
/*
 * 3D real->complex / complex->real FFT round trip using the "OpenMP" API on
 * the "-mic" device.
 *
 * usage: ./testFFT3DRC_MIC N
 * where N is the number of elements in each dimension.
 */
int main(int argc, char *argv[]) {

  /* N is mandatory; argv[1] must not be read when it is absent */
  if (argc < 2) {
    cout << "usage: ./testFFT3DRC_MIC N" << endl;
    return 1;
  }

  int N = atoi(argv[1]);
  if (N <= 0) {
    cout << "N must be a positive integer" << endl;
    return 1;
  }

  int dim = 3;
  int dimsize[3] = {N, N, N};
  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
  int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2];

  /* host buffers; the real arrays use the padded row length (N/2+1)*2 that
     init_r addresses */
  double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
  double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
  complex<double> *cfft = (complex<double> *)malloc(sizecomp*sizeof(complex<double>));
  if (rdata == NULL || outdata == NULL || cfft == NULL) {
    cout << "host memory allocation failed" << endl;
    free(rdata);
    free(outdata);
    free(cfft);
    return 1;
  }

  init_r(rdata, N,N,N);

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI("OpenMP", 6);
  base.setDevice("-mic", 4);
  base.initDevice();

  /* setup forward fft (REAL->COMPLEX) */
  base.setupFFTRC(dim, dimsize);

  int ierr;
  void *real_ptr, *comp_ptr;

  /* allocate memory on device */
  real_ptr = base.allocateMemory<double>(sizereal, ierr);
  comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);

  /* write data to device */
  /* NOTE(review): only the first sizereal values of the padded rdata buffer
     are transferred - confirm this matches the layout the device expects. */
  base.writeData<double>(real_ptr, rdata, sizereal);

  /* execute rcfft */
  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);

  /* read FFT data from device */
  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);

  /* write the spectrum back with the element type and count it was read
     with (it was sent as sizereal doubles before, i.e. only partially) */
  base.writeData< complex<double> >(comp_ptr, cfft, sizecomp);

  /* setup backward fft (COMPLEX->REAL) */
  base.setupFFTCR(dim, dimsize,1./(N*N*N));

  /* execute crfft */
  base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize);

  /* read IFFT data from device */
  base.readData<double>(real_ptr, outdata, sizereal);

  /* free device memory */
  base.freeMemory< complex<double> >(comp_ptr, sizecomp);
  base.freeMemory<double>(real_ptr, sizereal);

  /* compare data */
  compareData(rdata, outdata, N, dim);

  /* release host buffers */
  free(rdata);
  free(outdata);
  free(cfft);

  return 0;
}
/*
 * Print the real parts of an N^dim block of complex values to stdout, one
 * line per row and an empty line after each plane. With normalize set the
 * printed values are divided by N.
 */
void printData(complex<double>* &data, int N, int dim, bool normalize) {

  // the two outer extents collapse to 1 for lower dimensional data
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;

  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      const int first = p*planes*planes + r*rows;
      for (int c = 0; c < N; ++c) {
        const complex<double> &value = data[first + c];
        if (normalize)
          cout << value.real() / N << "\t";
        else
          cout << value.real() << "\t";
      }
      cout << endl;
    }
    cout << endl;
  }
}
/*
 * Print an N x N x N complex array, one output line per middle index j, with
 * all (i, k) entries of that j on the line as "re; im". Components with
 * magnitude below 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {

  const double eps = 10e-5;

  for (int row = 0; row < N; ++row) {
    for (int plane = 0; plane < N; ++plane) {
      const complex<double> *line = data + plane*N*N + row*N;
      for (int col = 0; col < N; ++col) {
        double re = line[col].real();
        double im = line[col].imag();
        if (-eps < re && re < eps)
          re = 0;
        if (-eps < im && im < eps)
          im = 0;
        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Print an N x N x N real array, one output line per middle index j, with
 * all (i, k) entries of that j on the line. Values with magnitude below
 * 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(double* data, int N, int dim) {

  const double eps = 10e-5;

  for (int row = 0; row < N; ++row) {
    for (int plane = 0; plane < N; ++plane) {
      const double *line = data + plane*N*N + row*N;
      for (int col = 0; col < N; ++col) {
        double value = line[col];
        if (-eps < value && value < eps)
          value = 0;
        cout << value << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Accumulate the absolute differences of the real and imaginary parts of two
 * complex arrays of N^dim elements and print the sum to stdout.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {

  // outer extents are 1 when the data has fewer than 3 / 2 dimensions
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double diff = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int first = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k) {
        const complex<double> &lhs = data1[first + k];
        const complex<double> &rhs = data2[first + k];
        diff += fabs(lhs.real() - rhs.real());
        diff += fabs(lhs.imag() - rhs.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
}
/*
 * Sum the absolute element-wise differences of two real arrays of N^dim
 * elements and print the sum to stdout.
 */
void compareData(double* data1, double* data2, int N, int dim) {

  // outer extents are 1 when the data has fewer than 3 / 2 dimensions
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double diff = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int first = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k)
        diff += fabs(data1[first + k] - data2[first + k]);
    }
  }

  cout << "Size " << N << " RC <--> CR diff: " << diff << endl;
}

159
test/testFFT3DSO.cpp Normal file
View File

@ -0,0 +1,159 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
void printData3DN4(complex<double>* &data, int N, int dim);
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/*
 * 3D complex FFT round trip: FFT -> IFFT -> normalize on the device, then
 * compare the data read back with the input.
 *
 * usage - ./testFFT3DSO [N [api_name [device_name]]]
 * With only N given the API is "Cuda"; with no arguments (or more than three)
 * N = 16 and the API is "OpenCL". The device defaults to "-gpu".
 */
int main(int argc, char *argv[]) {

  /* Point at argv / string literals instead of copying into fixed 10-byte
     buffers, which strcpy overran for long arguments. */
  int N = 16;
  const char *api_name = "OpenCL";
  const char *device_name = "-gpu";
  if (argc >= 2 && argc <= 4) {
    N = atoi(argv[1]);
    api_name = (argc >= 3) ? argv[2] : "Cuda";
    if (argc == 4)
      device_name = argv[3];
  }

  /* atoi yields 0 for non-numeric input; a non-positive N would give an
     invalid allocation size below. */
  if (N <= 0) {
    cout << "N must be a positive integer" << endl;
    return 1;
  }

  cout << "Use api: " << api_name << ", " << device_name << endl;

  int dimsize[3] = {N, N, N};

  cout << "Begin DKS Base tests, N = " << N << endl;

  int dim = 3;
  const int total = N*N*N;
  complex<double> *cdata = new complex<double>[total];
  complex<double> *cifft = new complex<double>[total];

  /* input varies along the fastest index only; result buffer starts at zero */
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
      }
    }
  }

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();
  base.setupFFT(3, dimsize);

  void *mem_ptr;
  int ierr;

  /* allocate memory on device */
  mem_ptr = base.allocateMemory< complex<double> >(total, ierr);

  /* write data to device */
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, total);

  /* execute fft */
  base.callFFT(mem_ptr, 3, dimsize);

  /* execute ifft */
  base.callIFFT(mem_ptr, 3, dimsize);

  /* execute normalize */
  base.callNormalizeFFT(mem_ptr, 3, dimsize);

  /* read data from device */
  base.readData< complex<double> >(mem_ptr, cifft, total);

  /* free device memory */
  base.freeMemory< complex<double> >(mem_ptr, total);

  /* compare results */
  compareData(cdata, cifft, N, dim);

  /* release host buffers */
  delete[] cdata;
  delete[] cifft;

  return 0;
}
/*
 * Print an N^dim block of complex values to stdout, one line per row and an
 * empty line after each plane. With normalize set only the real part divided
 * by N is printed; otherwise real and imaginary part are printed.
 */
void printData(complex<double>* &data, int N, int dim, bool normalize) {

  // the two outer extents collapse to 1 for lower dimensional data
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;

  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      const int first = p*planes*planes + r*rows;
      for (int c = 0; c < N; ++c) {
        const complex<double> &value = data[first + c];
        if (normalize) {
          cout << value.real() / N << "\t";
        } else {
          cout << value.real() << " ";
          cout << value.imag() << "\t";
        }
      }
      cout << endl;
    }
    cout << endl;
  }
}
/*
 * Print an N x N x N complex array, one output line per middle index j, with
 * all (i, k) entries of that j on the line as "re; im". Components with
 * magnitude below 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {

  const double eps = 10e-5;

  for (int row = 0; row < N; ++row) {
    for (int plane = 0; plane < N; ++plane) {
      const complex<double> *line = data + plane*N*N + row*N;
      for (int col = 0; col < N; ++col) {
        double re = line[col].real();
        double im = line[col].imag();
        if (-eps < re && re < eps)
          re = 0;
        if (-eps < im && im < eps)
          im = 0;
        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Accumulate the absolute differences of the real and imaginary parts of two
 * complex arrays of N^dim elements and print the sum to stdout.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {

  // outer extents are 1 when the data has fewer than 3 / 2 dimensions
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double diff = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int first = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k) {
        const complex<double> &lhs = data1[first + k];
        const complex<double> &rhs = data2[first + k];
        diff += fabs(lhs.real() - rhs.real());
        diff += fabs(lhs.imag() - rhs.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
}

130
test/testFFT3DTiming.cpp Normal file
View File

@ -0,0 +1,130 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/*
 * Timing of the 3D complex FFT round trip (allocate, write, FFT, IFFT,
 * normalize, read, free): one untimed warm-up pass followed by `steps`
 * timed passes whose average wall time is printed.
 *
 * usage: ./testFFT3DTiming [api_name [device_name [N]]]
 * Defaults: "OpenCL", "-gpu", N = 4.
 */
int main(int argc, char *argv[]) {

  /* Point at argv / string literals instead of copying into fixed 10-byte
     buffers, and read N only when a third argument really exists
     (argv[3] was dereferenced for argc == 3 before). */
  int N = 4;
  const char *api_name = "OpenCL";
  const char *device_name = "-gpu";
  if (argc >= 2)
    api_name = argv[1];
  if (argc >= 3)
    device_name = argv[2];
  if (argc >= 4)
    N = atoi(argv[3]);

  if (N <= 0) {
    cout << "N must be a positive integer" << endl;
    return 1;
  }

  int dimsize[3] = {N, N, N};
  const int total = N*N*N;

  cout << "Use api: " << api_name << endl;

  cout << "Begin DKS Base tests, N = " << N << endl;

  complex<double> *cdata = new complex<double>[total];
  complex<double> *cifft = new complex<double>[total];

  /* input varies along the slowest index only; result buffer starts at zero */
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        cdata[i*N*N + j*N + k] = complex<double>((double)i / N, 0);
        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
      }
    }
  }

  timestamp_t t0, t1;

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  /* length must describe device_name (it was taken from api_name before) */
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();

  void *mem_ptr;
  int ierr;

  /* run one untimed pass to init the device */
  mem_ptr = base.allocateMemory< complex<double> >(total, ierr);
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, total);
  base.callFFT(mem_ptr, 3, dimsize);
  base.callIFFT(mem_ptr, 3, dimsize);
  base.callNormalizeFFT(mem_ptr, 3, dimsize);
  base.readData< complex<double> >(mem_ptr, cifft, total);
  base.freeMemory< complex<double> >(mem_ptr, total);
  /* end test */

  int steps = 10;
  base.oclClearEvents();
  t0 = get_timestamp();
  for (int i = 0; i < steps; i++) {

    /* allocate memory on device */
    mem_ptr = base.allocateMemory< complex<double> >(total, ierr);

    /* write data to device */
    ierr = base.writeData< complex<double> >(mem_ptr, cdata, total);

    /* execute fft */
    base.callFFT(mem_ptr, 3, dimsize);

    /* execute ifft */
    base.callIFFT(mem_ptr, 3, dimsize);

    /* execute normalize */
    base.callNormalizeFFT(mem_ptr, 3, dimsize);

    /* read data from device */
    base.readData< complex<double> >(mem_ptr, cifft, total);

    /* free device memory; the size matches the allocation (was N) */
    base.freeMemory< complex<double> >(mem_ptr, total);

    //compareData(cdata, cifft, N, 3);
  }
  t1 = get_timestamp();

  cout << "=========================" << endl;
  //base.oclEventInfo();
  cout << "Average total: " << get_secs(t0, t1) / steps << endl;
  cout << "=========================" << endl;

  /* release host buffers */
  delete[] cdata;
  delete[] cifft;

  return 0;
}
/*
 * Accumulate the absolute differences of the real and imaginary parts of two
 * complex arrays of N^dim elements and print the sum to stdout.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {

  // outer extents are 1 when the data has fewer than 3 / 2 dimensions
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double diff = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int first = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k) {
        const complex<double> &lhs = data1[first + k];
        const complex<double> &rhs = data2[first + k];
        diff += fabs(lhs.real() - rhs.real());
        diff += fabs(lhs.imag() - rhs.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
}

117
test/testFFTAsync.cpp Normal file
View File

@ -0,0 +1,117 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include <cufft.h>
#include <cuda_runtime.h>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/*
 * Fill a dimsize[2] x dimsize[1] x dimsize[0] array (dimsize[0] fastest) so
 * that every element holds its index along the fastest dimension.
 */
void initData(double *data, int dimsize[3]) {

  const int nx = dimsize[0];
  const int ny = dimsize[1];
  const int nz = dimsize[2];

  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      const int first = z*ny*nx + y*nx;
      for (int x = 0; x < nx; ++x)
        data[first + x] = x;
    }
  }
}
int main(int argc, char *argv[]) {
int N = 8;
if (argc == 2)
N = atoi(argv[1]);
int N1 = N;
int N2 = N;
int N3 = N;
int dim = 3;
int dimsize[3] = {N3, N2, N1};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1);
double *data1 = new double[sizereal];
double *data2 = new double[sizereal];
initData(data1, dimsize);
initData(data2, dimsize);
/* init DKSBase */
cout << "Init device and set function" << endl;
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
base.setupFFT(3, dimsize);
/* pagelock data */
base.allocateHostMemory(data1, sizereal);
base.allocateHostMemory(data2, sizereal);
/* create streams */
int fft1, fft2;
base.createStream(fft1);
base.createStream(fft2);
int ierr;
void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2;
cout << "allocating memory ..." << endl;
/* allocate memory on device */;
real_ptr1 = base.allocateMemory<double>(sizereal, ierr);
real_ptr2 = base.allocateMemory<double>(sizereal, ierr);
comp_ptr1 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
comp_ptr2 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
cufftHandle defaultPlan;
cudaStream_t cfft1, cfft2;
cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z);
cudaStreamCreate(&cfft1);
cudaStreamCreate(&cfft2);
for (int i = 0; i < 5; i++) {
cufftHandle plan = defaultPlan;
cout << "Iteration: " << i << endl;
/* write data to device */
base.writeDataAsync<double>(real_ptr1, data1, sizereal, fft1);
//cudaMemcpyAsync( (double*)real_ptr1,data1,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft1);
/* execute rcfft */
base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1);
//cufftSetStream(plan, cfft1);
//cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr1, (cufftDoubleComplex*)comp_ptr2);
/* write data to device */
base.writeDataAsync<double>(real_ptr2, data2, sizereal, fft2);
//cudaMemcpyAsync( (double*)real_ptr2,data2,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft2);
/* execute rcfft */
base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2);
//cufftSetStream(plan, cfft2);
//cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr2, (cufftDoubleComplex*)comp_ptr2);
}
base.freeMemory<double>(real_ptr1, sizereal);
base.freeMemory<double>(real_ptr2, sizereal);
base.freeMemory< complex<double> >(comp_ptr1, sizereal);
base.freeMemory< complex<double> >(comp_ptr2, sizereal);
/* free pagelock data */
base.freeHostMemory(data1, sizereal);
base.freeHostMemory(data2, sizereal);
return 0;
}

301
test/testFFTSolver.cpp Normal file
View File

@ -0,0 +1,301 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include "DKSBase.h"
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "cuda_runtime.h"
using namespace std;
/*
 * Print NI planes of N x N values to stdout, one line per row and an empty
 * line after each plane. A non-empty message is printed first.
 */
void printData3D(double* data, int N, int NI, const char *message = "") {

  const bool hasMessage = (strcmp(message, "") != 0);
  if (hasMessage)
    cout << message;

  for (int plane = 0; plane < NI; ++plane) {
    for (int row = 0; row < N; ++row) {
      const double *line = data + plane*N*N + row*N;
      for (int col = 0; col < N; ++col)
        cout << line[col] << "\t";
      cout << endl;
    }
    cout << endl;
  }
}
/*
 * Write k+1 into the corner (N/4+1) x (N/2+1) x (N/2+1) of an array that is
 * addressed with row length N and plane size N*N; k is the index along the
 * fastest dimension. Other elements are left untouched.
 */
void initData(double *data, int N) {

  const int planes = N/4 + 1;
  const int extent = N/2 + 1;

  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < extent; ++r) {
      const int first = p*N*N + r*N;
      for (int c = 0; c < extent; ++c)
        data[first + c] = c+1;
    }
  }
}
/* Store its own index in each of the first N elements of data. */
void initData2(double *data, int N) {
  int idx = 0;
  while (idx < N) {
    data[idx] = idx;
    ++idx;
  }
}
/* Set the first N elements of d to the complex value (2, 0). */
void initComplex( complex<double> *d, int N) {
  const complex<double> fill(2, 0);
  for (complex<double> *it = d; it < d + N; ++it)
    *it = fill;
}
/* Print the first N elements of d on one tab separated line. */
void printComplex(complex<double> *d, int N) {
  int idx = 0;
  while (idx < N) {
    cout << d[idx] << "\t";
    ++idx;
  }
  cout << endl;
}
/*
 * Fill an n3 x n2 x n1 array (n1 fastest): elements whose indices are all
 * below n/2 + 1 of their dimension receive consecutive values starting at 1
 * in traversal order, all other elements are set to 0.
 */
void initMirror(double *data, int n1, int n2, int n3) {

  const int lim1 = n1/2 + 1;
  const int lim2 = n2/2 + 1;
  const int lim3 = n3/2 + 1;

  int next = 1;
  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      const int first = z * n2 * n1 + y * n1;
      for (int x = 0; x < n1; ++x) {
        const bool inside = (z < lim3) && (y < lim2) && (x < lim1);
        if (inside) {
          data[first + x] = next;
          ++next;
        } else {
          data[first + x] = 0;
        }
      }
    }
  }
}
/* Print a line consisting of c '-' characters. */
void printDiv(int c) {
  int left = c;
  while (left > 0) {
    cout << "-";
    --left;
  }
  cout << endl;
}

/*
 * Print a 75 character divider followed by the n3 x n2 x n1 array (n1
 * fastest): one line per row, an empty line after each plane and one more
 * empty line at the end.
 */
void printMirror(double *data, int n1, int n2, int n3) {

  printDiv(75);

  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      const double *line = data + z * n2 * n1 + y * n1;
      for (int x = 0; x < n1; ++x)
        cout << line[x] << "\t";
      cout << endl;
    }
    cout << endl;
  }
  cout << endl;
}
/* Return the sum of the first datasize elements of data, added in index
   order starting from 0. */
double sumData(double *data, int datasize) {
  double total = 0;
  const double *it = data;
  for (int left = datasize; left > 0; --left) {
    total += *it;
    ++it;
  }
  return total;
}
/*
 * FFT Poisson solver style test on 8 MPI ranks sharing one GPU.
 * Rank 0 allocates the device buffers and publishes the rho buffer to the
 * other ranks; in every iteration rank 0 computes the Greens function and
 * its FFT, all ranks gather their part of rho into the shared device buffer,
 * rank 0 multiplies the two spectra and transforms back, and all ranks
 * scatter the result to rho_out. Rank 0 reports iterations in which the sum
 * of rho_out differs from the first iteration.
 */
int main(int argc, char *argv[]) {

  /* mpi init */
  int rank, nprocs;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  if (nprocs != 8) {
    cout << "example was set to run with 8 processes" << endl;
    cout << "exit..." << endl;
    /* MPI must be shut down on every exit path after MPI_Init */
    MPI_Finalize();
    return 0;
  }

  /* set domain size */
  int NG[3] = {64, 64, 32};
  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
  int sizerho = NG[0] * NG[1] * NG[2];
  int sizegreen = ng[0] * ng[1] * ng[2];
  /* A multi-dimensional real->complex transform halves one dimension only
     (n/2+1 entries), so the spectrum is larger than sizerho/2 + 1, which was
     used before. Same formula as in testFFT3DRC. */
  int sizecomp = (NG[0]/2 + 1) * NG[1] * NG[2];

  /* start index of the local sub domain in the global domain */
  int id[3];
  id[0] = 0;
  id[1] = NL[1] * (rank % 4);
  id[2] = NL[2] * (rank / 4);

  /* print some messages about the example in the beginning */
  if (rank == 0) {
    cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
    cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
    cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;

    cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
    int tmp[3];
    for (int p = 1; p < nprocs; p++) {
      MPI_Status mpistatus;
      MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
      cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
    }
  } else {
    MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
  }

  /* dks init and create 2 streams */
  int dkserr;
  int streamGreens, streamFFT;
  DKSBase base;
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();
  base.createStream(streamFFT);

  if (rank == 0) {
    base.createStream(streamGreens);
    base.setupFFT(3, NG);
  }

  /* allocate memory and init rho field */
  /* Both arrays are value-initialized: initMirror only fills the local
     NL part of rho, and rho_out is summed over all sizerho elements below,
     which read indeterminate values before. */
  double *rho = new double[sizerho]();
  double *rho_out = new double[sizerho]();
  initMirror(rho, NL[0], NL[1], NL[2]);

  /*
    allocate memory on device for
    - rho field
    - rho FFT
    - tmpgreen
    - greens integral
    - greens integral FFT
  */
  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;

  if (rank == 0) {
    tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
    rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
    grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
    rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
    grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
  } else {
    grntr_ptr = NULL;
    rho2_ptr = NULL;
    grn_ptr = NULL;
    rho2tr_ptr = NULL;
    tmpgreen_ptr = NULL;
  }

  /* send and receive pointer to allocated memory on device */
  if (rank == 0) {
    for (int p = 1; p < nprocs; p++)
      base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
  } else {
    rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
  }
  MPI_Barrier(MPI_COMM_WORLD);

  /* =================================================*/
  /* =================================================*/
  /* ====loop through fftpoisson solver iterations====*/
  /* =================================================*/
  /* =================================================*/

  double old_sum = 0;
  double tmp_sum = 0;
  for (int l = 0; l < 10000; l++) {
    MPI_Barrier(MPI_COMM_WORLD);

    /* on node 0, calculate tmpgreen on gpu */
    int hr_m[3] = {1, 1, 1};
    if (rank == 0)
      base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
                              hr_m[0], hr_m[1], hr_m[2], streamGreens);

    /* calculate greens integral on gpu */
    if (rank == 0)
      base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens);

    /* mirror the field */
    if (rank == 0)
      base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens);

    /* get FFT of mirrored greens integral */
    if (rank == 0)
      base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens);

    /* transfer rho field to device */
    base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
    MPI_Barrier(MPI_COMM_WORLD);

    /* get FFT of rho field */
    if (rank == 0) {
      base.syncDevice();
      base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
    }

    /* multiply both FFTs */
    if (rank == 0)
      base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);

    MPI_Barrier(MPI_COMM_WORLD);

    /* inverse fft and transfer data back */
    /*
      multiple device syncs and mpi barriers are used to make sure data
      transfer is started when results are ready and program moves on
      only when data transfer is finished
    */
    if (rank == 0) {
      /* real buffer first, complex buffer second - the order used by the
         other callC2RFFT call sites (the arguments were swapped before) */
      base.callC2RFFT(rho2_ptr, rho2tr_ptr, 3, NG);
      base.syncDevice();
      MPI_Barrier(MPI_COMM_WORLD);
      base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
      MPI_Barrier(MPI_COMM_WORLD);
      base.syncDevice();
      MPI_Barrier(MPI_COMM_WORLD);

      /* the first iteration provides the reference sum; the exact
         comparison is intentional, every iteration repeats the same
         computation */
      if (l == 0) {
        old_sum = sumData(rho_out, sizerho);
      } else {
        tmp_sum = sumData(rho_out, sizerho);
        if (old_sum != tmp_sum) {
          cout << "diff in iteration: " << l << endl;
        }
      }
    } else {
      MPI_Barrier(MPI_COMM_WORLD);
      base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
      MPI_Barrier(MPI_COMM_WORLD);
      MPI_Barrier(MPI_COMM_WORLD);
    }
  }

  /* =================================================*/
  /* =================================================*/
  /* ==========end fftpoisson solver test run=========*/
  /* =================================================*/
  /* =================================================*/

  /* free memory on device */
  if (rank == 0) {
    base.freeMemory<double>(tmpgreen_ptr, sizegreen);
    base.freeMemory<double>(grn_ptr, sizerho);
    base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
    base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
    MPI_Barrier(MPI_COMM_WORLD);
    base.freeMemory<double>(rho2_ptr, sizerho);
    cout << "Final sum: " << old_sum << endl;
  } else {
    base.closeHandle(rho2_ptr);
    MPI_Barrier(MPI_COMM_WORLD);
  }

  /* make sure no asynchronous transfer still uses the host buffers */
  base.syncDevice();
  delete[] rho;
  delete[] rho_out;

  MPI_Finalize();

  return 0;
}

319
test/testFFTSolver_MIC.cpp Normal file
View File

@ -0,0 +1,319 @@
#include <iostream>
//#include <mpi.h>
#include <string.h>
#include "DKSBase.h"
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "cuda_runtime.h"
using namespace std;
/**
 * Print the first NI planes of a double field to stdout.
 *
 * The buffer is read as consecutive planes of N rows by N columns, i.e.
 * element (i, j, k) is taken from data[i*N*N + j*N + k]. Values are tab
 * separated, each row ends with a newline and each plane is followed by an
 * empty line.
 *
 * @param data     values to print, at least NI*N*N entries
 * @param N        rows and columns per plane
 * @param NI       number of planes to print
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D(double* data, int N, int NI, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    // Planes, rows and columns are stored back to back, so a single running
    // offset visits the elements in exactly the order they are printed.
    int offset = 0;
    for (int plane = 0; plane < NI; ++plane) {
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Fill the low-index corner of an N x N x N field with a ramp.
 *
 * For planes 0..N/4, rows 0..N/2 and columns 0..N/2 the element at
 * data[i*N*N + j*N + k] is set to k+1. All other entries are left untouched.
 *
 * @param data  field storage, indexed as described above
 * @param N     edge length used for the index computation
 */
void initData(double *data, int N) {
    const int planes = N / 4 + 1;
    const int extent = N / 2 + 1;

    for (int plane = 0; plane < planes; ++plane) {
        for (int row = 0; row < extent; ++row) {
            const int start = plane * N * N + row * N;
            for (int col = 0; col < extent; ++col)
                data[start + col] = col + 1;
        }
    }
}
/**
 * Set every one of the first N entries to its own index (data[i] = i).
 *
 * @param data  array with at least N entries
 * @param N     number of entries to write
 */
void initData2(double *data, int N) {
    // Each entry depends only on its index, so the fill order is irrelevant.
    for (int pos = N - 1; pos >= 0; --pos)
        data[pos] = pos;
}
/**
 * Set the first N complex values to 2 + 0i.
 *
 * @param d  array with at least N entries
 * @param N  number of entries to write
 */
void initComplex( complex<double> *d, int N) {
    const complex<double> fill(2, 0);
    int remaining = N;
    while (remaining-- > 0)
        d[remaining] = fill;
}
/**
 * Print the first N complex values on one line, each followed by a tab,
 * and finish the line with a newline.
 *
 * @param d  values to print
 * @param N  number of values to print
 */
void printComplex(complex<double> *d, int N) {
    int i = 0;
    while (i < N) {
        std::cout << d[i] << "\t";
        ++i;
    }
    std::cout << std::endl;
}
/**
 * Initialise an n1 x n2 x n3 field (n1 fastest) for the mirror tests.
 *
 * Entries whose three indices are all within the first n/2 + 1 positions of
 * their dimension receive consecutive values 1, 2, 3, ... in storage order;
 * every other entry is set to 0.
 *
 * @param data  field storage with n1*n2*n3 entries
 * @param n1    extent of the fastest varying dimension
 * @param n2    extent of the middle dimension
 * @param n3    extent of the slowest varying dimension
 */
void initMirror(double *data, int n1, int n2, int n3) {
    const int lim1 = n1 / 2 + 1;
    const int lim2 = n2 / 2 + 1;
    const int lim3 = n3 / 2 + 1;

    int next = 1;   // next value handed out inside the low-index corner
    int pos = 0;    // running storage offset, equals i*n2*n1 + j*n1 + k
    for (int i = 0; i < n3; ++i) {
        for (int j = 0; j < n2; ++j) {
            for (int k = 0; k < n1; ++k, ++pos) {
                const bool inside = (i < lim3) && (j < lim2) && (k < lim1);
                if (inside)
                    data[pos] = next++;
                else
                    data[pos] = 0;
            }
        }
    }
}
/**
 * Print a divider: c dashes followed by a newline.
 *
 * @param c  number of dashes; nothing but the newline is printed when c <= 0
 */
void printDiv(int c) {
    int left = c;
    while (left-- > 0)
        std::cout << "-";
    std::cout << std::endl;
}
/**
 * Print an n1 x n2 x n3 field (n1 fastest) below a 75 character divider.
 *
 * Values are tab separated, each row of n1 values ends with a newline, each
 * plane of n2 rows is followed by an empty line and one more empty line
 * closes the output.
 *
 * @param data  field storage with n1*n2*n3 entries
 * @param n1    values per row
 * @param n2    rows per plane
 * @param n3    number of planes
 */
void printMirror(double *data, int n1, int n2, int n3) {
    printDiv(75);

    int pos = 0;   // running storage offset, equals i*n2*n1 + j*n1 + k
    for (int plane = 0; plane < n3; ++plane) {
        for (int row = 0; row < n2; ++row) {
            for (int col = 0; col < n1; ++col)
                std::cout << data[pos++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
/**
 * Add up the first datasize entries in index order.
 *
 * @param data      values to add
 * @param datasize  number of values
 * @return the sum, 0 when datasize <= 0
 */
double sumData(double *data, int datasize) {
    double total = 0;
    int i = 0;
    while (i < datasize)
        total += data[i++];
    return total;
}
int main(int argc, char *argv[]) {
/* mpi init */
//int rank, nprocs;
//MPI_Init(&argc, &argv);
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/*
if (nprocs != 8) {
cout << "example was set to run with 8 processes" << endl;
cout << "exit..." << endl;
return 0;
}
*/
/* set domain size */
int NG[3] = {64, 64, 32};
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
int id[3];
//id[0] = 0;
//id[1] = NL[1] * (rank % 4);
//id[2] = NL[2] * (rank / 4);
/* print some messages bout the example in the begginig */
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
int tmp[3];
/* for (int p = 1; p < nprocs; p++) {
MPI_Status mpistatus;
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
}*/
// } else {
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
// }
/* dks init and create 2 streams */
int dkserr;
//int streamGreens, streamFFT;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
#endif
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
#endif
//base.createStream(streamFFT);
//if (rank == 0) {
// base.createStream(streamGreens);
base.setupFFT(3, NG);
//}
/* allocate memory and init rho field */
double *rho = new double[sizerho];
double *rho_out = new double[sizerho];
//double *green_out = new double[sizegreen];
initMirror(rho, NL[0], NL[1], NL[2]);
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
// if (rank == 0) {
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* } else {
grntr_ptr = NULL;
rho2_ptr = NULL;
grn_ptr = NULL;
rho2tr_ptr = NULL;
tmpgreen_ptr = NULL;
}*/
/* send and receive pointer to allocated memory on device */
/*
if (rank == 0) {
for (int p = 1; p < nprocs; p++)
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
} else {
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
}
MPI_Barrier(MPI_COMM_WORLD);
*/
/* =================================================*/
/* =================================================*/
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
double old_sum = 0;
double tmp_sum = 0;
for (int l = 0; l < 100; l++) {
//MPI_Barrier(MPI_COMM_WORLD);
/* on node 0, calculate tmpgreen on gpu */
int hr_m[3] = {1, 1, 1};
//if (rank == 0)
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
hr_m[0], hr_m[1], hr_m[2]);
/* calculate greens integral on gpu */
//if (rank == 0)
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
/* mirror the field */
//if (rank == 0)
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
/* get FFT of mirrored greens integral */
//if (rank == 0)
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* transfer rho field to device */
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
/* get FFT of rho field */
//if (rank == 0) {
//base.syncDevice();
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
//}
/* multiply both FFTs */
//if (rank == 0)
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
//if (rank == 0) {
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//cout << "result: " << sumData(rho_out, sizerho) << endl;
if (l == 0) {
old_sum = sumData(rho_out, sizerho);
} else {
tmp_sum = sumData(rho_out, sizerho);
if (old_sum != tmp_sum) {
cout << "diff in iteration: " << l << endl;
}
}
/*} else {
MPI_Barrier(MPI_COMM_WORLD);
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
*/
}
/* =================================================*/
/* =================================================*/
/* ==========end fftpoison solver test run==========*/
/* =================================================*/
/* =================================================*/
/* free memory on device */
//if (rank == 0) {
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
base.freeMemory<double>(rho2_ptr, sizerho);
cout << "Final sum: " << old_sum << endl;
/*} else {
base.closeHandle(rho2_ptr);
MPI_Barrier(MPI_COMM_WORLD);
}*/
//MPI_Finalize();
}

172
test/testGather.cpp Normal file
View File

@ -0,0 +1,172 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "DKSBase.h"
using namespace std;
/**
 * Print an N x N x N integer field to stdout.
 *
 * Element (i, j, k) is taken from data[i*N*N + j*N + k]. Values are tab
 * separated, each row ends with a newline and each plane is followed by an
 * empty line.
 *
 * @param data     values to print, N*N*N entries
 * @param N        edge length of the field
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D(int* data, int N, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    // Storage order equals print order, so one running offset is enough.
    int offset = 0;
    for (int plane = 0; plane < N; ++plane) {
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Print an nx x ny x nz integer field (nx fastest) to stdout.
 *
 * Element (i, j, k) is taken from data[i*ny*nx + j*nx + k]. Values are tab
 * separated, each row of nx values ends with a newline and each plane of ny
 * rows is followed by an empty line.
 *
 * @param data     values to print, nx*ny*nz entries
 * @param nx       values per row
 * @param ny       rows per plane
 * @param nz       number of planes
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    int offset = 0;
    for (int plane = 0; plane < nz; ++plane) {
        for (int row = 0; row < ny; ++row) {
            for (int col = 0; col < nx; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Print nprocs rows of N integers each.
 *
 * Row i holds data[i*N] .. data[i*N + N - 1]; values are tab separated and
 * each row ends with a newline.
 *
 * @param data     values to print, N*nprocs entries
 * @param N        values per row
 * @param nprocs   number of rows
 * @param message  text written before the data; nothing is written when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    int offset = 0;
    for (int row = 0; row < nprocs; ++row) {
        for (int col = 0; col < N; ++col)
            std::cout << data[offset++] << "\t";
        std::cout << std::endl;
    }
}
/**
 * Fill the first N entries with rank + 1.
 *
 * @param data  array with at least N entries
 * @param N     number of entries to write
 * @param rank  value the fill is derived from
 */
void initData(int *data, int N, int rank) {
    const int value = rank + 1;
    int remaining = N;
    while (remaining-- > 0)
        data[remaining] = value;
}
/**
 * Gather / scatter test: every rank contributes a 64 x 32 x 16 block of a
 * 64 x 64 x 32 field that is gathered into device memory on rank 0 and
 * scattered back again. The sequence is executed twice; NVTX markers frame
 * the second pass.
 *
 * The start-offset tables describe four blocks, so at most four processes
 * are supported.
 *
 * @return 0 on success, 1 when started with more than four processes
 */
int main(int argc, char *argv[]) {

    int ierr;
    int rank, nprocs;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    cout << "Rank " << (rank+1) << " from " << nprocs << endl;

    const int max_procs = 4;
    int N_global[3] = {64, 64, 32};
    int N_local[3] = {64, 32, 16};
    int n = N_local[0] * N_local[1] * N_local[2];
    int idx[max_procs] = {0, 0, 0, 0};
    int idy[max_procs] = {0, 32, 0, 32};
    int idz[max_procs] = {0, 0, 16, 16};

    /* the offset tables only have max_procs entries */
    if (nprocs > max_procs) {
        if (rank == 0)
            cout << "example supports at most " << max_procs << " processes" << endl;
        MPI_Finalize();
        return 1;
    }

    DKSBase base = DKSBase();
    base.setAPI("Cuda", 4);
    base.setDevice("-gpu", 4);
    base.initDevice();

    /*
      Remember which allocator produced the buffer so that it is released
      with the matching call at the end.
    */
    int *hdata_in;
    bool pinned = true;
    if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
        hdata_in = new int[n];
        pinned = false;
        cout << "pinned allocation failed!" << endl;
    }
    initData(hdata_in, n, rank);

    for (int i = 0; i < 2; i++) {
        MPI_Barrier(MPI_COMM_WORLD);
        if (i == 1)
            nvtxMarkA("start gather");

        if (rank == 0) {
            void *mem_ptr, *tmpgreen_ptr;
            mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);

            //call another kernel
            int sizegreen = 33 * 33 * 17;
            tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
            nvtxMarkA("call green");
            base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007);

            nvtxMarkA("call gather");
            base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
                              idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);

            //read and print data once for debug only
            /*
            if (i == 0 && nprocs*n < 257) {
                int *hdata_out_all = new int[nprocs*n];
                base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
                printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]);
            }
            else {
                int *hout_data = new int[nprocs*n];
                base.readData<int>(mem_ptr, hout_data, nprocs*n);
                int sum = 0;
                for (int s = 0; s < nprocs*n; s++)
                    sum += hout_data[s];
                cout << "Sum: " << sum << endl;
            }
            */

            MPI_Barrier(MPI_COMM_WORLD);
            nvtxMarkA("call scatter");
            base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
                               idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);

            base.freeMemory<int>(mem_ptr, n*nprocs);
            base.freeMemory<double>(tmpgreen_ptr, sizegreen);
        } else {
            nvtxMarkA("call gather");
            base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local,
                              idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);

            MPI_Barrier(MPI_COMM_WORLD);
            nvtxMarkA("call scatter");
            base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local,
                               idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
        }

        if (i == 1)
            nvtxMarkA("end gather");
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* memory from the new[] fallback must not go through freeHostMemory */
    if (pinned)
        base.freeHostMemory(hdata_in, n);
    else
        delete[] hdata_in;

    MPI_Finalize();
    return 0;
}

144
test/testGatherAsync.cpp Normal file
View File

@ -0,0 +1,144 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "DKSBase.h"
using namespace std;
/**
 * Print an N x N x N integer field to stdout.
 *
 * Element (i, j, k) is taken from data[i*N*N + j*N + k]. Values are tab
 * separated, each row ends with a newline and each plane is followed by an
 * empty line.
 *
 * @param data     values to print, N*N*N entries
 * @param N        edge length of the field
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D(int* data, int N, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    // Storage order equals print order, so one running offset is enough.
    int offset = 0;
    for (int plane = 0; plane < N; ++plane) {
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Print nprocs rows of N integers each.
 *
 * Row i holds data[i*N] .. data[i*N + N - 1]; values are tab separated and
 * each row ends with a newline.
 *
 * @param data     values to print, N*nprocs entries
 * @param N        values per row
 * @param nprocs   number of rows
 * @param message  text written before the data; nothing is written when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    int offset = 0;
    for (int row = 0; row < nprocs; ++row) {
        for (int col = 0; col < N; ++col)
            std::cout << data[offset++] << "\t";
        std::cout << std::endl;
    }
}
/**
 * Fill the first N entries with rank + 1.
 *
 * @param data  array with at least N entries
 * @param N     number of entries to write
 * @param rank  value the fill is derived from
 */
void initData(int *data, int N, int rank) {
    const int value = rank + 1;
    int remaining = N;
    while (remaining-- > 0)
        data[remaining] = value;
}
int main(int argc, char *argv[]) {
int ierr;
int rank, nprocs;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
cout << "Rank " << (rank+1) << " from " << nprocs << endl;
//mpi copy
int n = 32*16*16;
int N_global[3] = {32, 32, 32};
int N_local[3] = {32, 16, 16};
int idx[4] = {0, 0, 0, 0};
int idy[4] = {0, 0, 16, 16};
int idz[4] = {0, 16, 0, 16};
//greens kernel
int n1 = 33;
int n2 = 33;
int n3 = 17;
int sizegreen = n1*n2*n3;
DKSBase base = DKSBase();
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
int *hdata_in;
if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
hdata_in = new int[n];
cout << "pinned allocation failed!" << endl;
}
initData(hdata_in, n, rank);
int stream2;
for (int i = 0; i < 2; i++) {
if (rank == 0) {
if (i == 0) {
cudaProfilerStart();
base.createStream(stream2);
}
nvtxMarkA("start gather");
void *mem_ptr, *green_ptr;
mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
green_ptr = base.allocateMemory<int>(sizegreen, ierr);
nvtxMarkA("call gather");
MPI_Request request;
MPI_Status status;
base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
request);
nvtxMarkA("call kernel");
base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1,
4.160715e-03, 4.474911e-03, 1.247311e-02, stream2);
MPI_Wait(&request, &status);
base.freeMemory<int>(mem_ptr, n*nprocs);
base.freeMemory<int>(green_ptr, sizegreen);
MPI_Barrier(MPI_COMM_WORLD);
nvtxMarkA("end gather");
if (i == 1) cudaProfilerStop();
} else {
MPI_Request request;
base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local,
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
request);
MPI_Barrier(MPI_COMM_WORLD);
}
}
base.freeHostMemory(hdata_in, n);
MPI_Finalize();
return 0;
}

205
test/testGatherAsync2.cpp Normal file
View File

@ -0,0 +1,205 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "DKSBase.h"
using namespace std;
/**
 * Print an N x N x N integer field to stdout.
 *
 * Element (i, j, k) is taken from data[i*N*N + j*N + k]. Values are tab
 * separated, each row ends with a newline and each plane is followed by an
 * empty line.
 *
 * @param data     values to print, N*N*N entries
 * @param N        edge length of the field
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D(int* data, int N, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    // Storage order equals print order, so one running offset is enough.
    int offset = 0;
    for (int plane = 0; plane < N; ++plane) {
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Print an nx x ny x nz integer field (nx fastest) to stdout.
 *
 * Element (i, j, k) is taken from data[i*ny*nx + j*nx + k]. Values are tab
 * separated, each row of nx values ends with a newline and each plane of ny
 * rows is followed by an empty line.
 *
 * @param data     values to print, nx*ny*nz entries
 * @param nx       values per row
 * @param ny       rows per plane
 * @param nz       number of planes
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    int offset = 0;
    for (int plane = 0; plane < nz; ++plane) {
        for (int row = 0; row < ny; ++row) {
            for (int col = 0; col < nx; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Print nprocs*N integers on a single line, each followed by a tab, then
 * end the line and add one empty line.
 *
 * @param data     values to print, N*nprocs entries
 * @param N        values per process
 * @param nprocs   number of processes
 * @param message  text written before the data; nothing is written when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    const int count = nprocs * N;
    int i = 0;
    while (i < count)
        std::cout << data[i++] << "\t";
    std::cout << std::endl << std::endl;
}
/**
 * Fill the first N entries with rank + 1.
 *
 * @param data  array with at least N entries
 * @param N     number of entries to write
 * @param rank  value the fill is derived from
 */
void initData(int *data, int N, int rank) {
    const int value = rank + 1;
    int remaining = N;
    while (remaining-- > 0)
        data[remaining] = value;
}
/**
 * Gather / scatter test with a device pointer shared between ranks.
 *
 * Rank 0 allocates the device field and sends its pointer to all other
 * ranks. Every rank then writes its 128 x 64 x 32 block into the
 * 128 x 128 x 64 device field and reads it back into a second host buffer.
 * Each rank finally prints the sums of the data it sent and the data it
 * received.
 *
 * @return 0 on completion
 */
int main(int argc, char *argv[]) {

    int ierr;
    int rank, nprocs;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int Ng[3] = {128, 128, 64};
    int Nl[3] = {128, 64, 32};
    int nglobal = Ng[0] * Ng[1] * Ng[2];
    int nlocal = Nl[0] * Nl[1] * Nl[2];

    DKSBase base = DKSBase();
    base.setAPI("Cuda", 4);
    base.setDevice("-gpu", 4);
    base.initDevice();

    /*
      Remember which allocator produced each host buffer so that it is
      released with the matching call at the end.
    */
    int *hdata_in;
    bool pinned_in = true;
    if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) {
        hdata_in = new int[nlocal];
        pinned_in = false;
        cout << "pinned allocation failed!" << endl;
    }
    initData(hdata_in, nlocal, rank);

    int *hdata_out;
    bool pinned_out = true;
    if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) {
        hdata_out = new int[nlocal];
        pinned_out = false;
        cout << "pinned allocation failed!" << endl;
    }

    //create streams for async execution
    int stream1, stream2;
    base.createStream(stream1);
    base.createStream(stream2);

    if (rank == 0)
        base.setupFFT(3, Ng);

    for (int i = 0; i < 1; i++) {
        MPI_Barrier(MPI_COMM_WORLD);
        if (i == 1)
            nvtxMarkA("start gather");

        if (rank == 0) {
            int id[3] = {0, 0, 0};
            void *mem_ptr, *tmpgreen_ptr, *comp_ptr;

            //allocate memory on device
            int sizegreen = 65 * 65 * 33;
            int sizecomp = 65 * 128 * 64;
            mem_ptr = base.allocateMemory<double>(nglobal, ierr);
            tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
            comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);

            //send pointer to other processes
            nvtxMarkA("call gather");
            for (int j = 1; j < nprocs; j++)
                base.sendPointer(mem_ptr, j, MPI_COMM_WORLD);

            //call another kernel while data transfer is processing
            nvtxMarkA("call green");
            base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2);

            //write data to device
            base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);

            /* execute rcfft */
            //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng);

            base.syncDevice();
            MPI_Barrier(MPI_COMM_WORLD);

            //read data from device
            base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);

            MPI_Barrier(MPI_COMM_WORLD);
            base.syncDevice();
            MPI_Barrier(MPI_COMM_WORLD);

            base.freeMemory<double>(mem_ptr, nglobal);
            base.freeMemory<double>(tmpgreen_ptr, sizegreen);
            base.freeMemory< complex<double> >(comp_ptr, sizecomp);
        } else {
            void *mem_ptr;

            /* start offset of this rank's block inside the global field */
            int idy = 0;
            int idz = 0;//Nl[2]*rank;
            if (rank / 2 == 1) idy = Ng[1] / 2;
            if (rank % 2 == 1) idz = Ng[2] / 2;
            int id[3] = {0, idy, idz};

            nvtxMarkA("call gather");
            mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr);

            base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
            MPI_Barrier(MPI_COMM_WORLD);

            base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
            MPI_Barrier(MPI_COMM_WORLD);
            MPI_Barrier(MPI_COMM_WORLD);

            base.closeHandle(mem_ptr);
        }

        int sum1 = 0;
        for (int c = 0; c < nlocal; c++)
            sum1 += hdata_in[c];

        int sum2 = 0;
        for (int c = 0; c < nlocal; c++)
            sum2 += hdata_out[c];

        cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl;

        if (i == 1)
            nvtxMarkA("end gather");
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* release both host buffers; new[] fallbacks go through delete[] */
    if (pinned_in)
        base.freeHostMemory(hdata_in, nlocal);
    else
        delete[] hdata_in;

    if (pinned_out)
        base.freeHostMemory(hdata_out, nlocal);
    else
        delete[] hdata_out;

    MPI_Finalize();
    return 0;
}

239
test/testGreens.cpp Normal file
View File

@ -0,0 +1,239 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include <complex>
#include "DKSBase.h"
#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "cuda_runtime.h"
using namespace std;
/**
 * Print the first NI planes of a double field to stdout.
 *
 * The buffer is read as consecutive planes of N rows by N columns, i.e.
 * element (i, j, k) is taken from data[i*N*N + j*N + k]. Values are tab
 * separated, each row ends with a newline and each plane is followed by an
 * empty line.
 *
 * @param data     values to print, at least NI*N*N entries
 * @param N        rows and columns per plane
 * @param NI       number of planes to print
 * @param message  text written before the data; nothing is written when empty
 */
void printData3D(double* data, int N, int NI, const char *message = "") {
    if (message[0] != '\0')
        std::cout << message;

    // Planes, rows and columns are stored back to back, so a single running
    // offset visits the elements in exactly the order they are printed.
    int offset = 0;
    for (int plane = 0; plane < NI; ++plane) {
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col)
                std::cout << data[offset++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/**
 * Fill the low-index corner of an N x N x N field with a ramp.
 *
 * For planes 0..N/4, rows 0..N/2 and columns 0..N/2 the element at
 * data[i*N*N + j*N + k] is set to k+1. All other entries are left untouched.
 *
 * @param data  field storage, indexed as described above
 * @param N     edge length used for the index computation
 */
void initData(double *data, int N) {
    const int planes = N / 4 + 1;
    const int extent = N / 2 + 1;

    for (int plane = 0; plane < planes; ++plane) {
        for (int row = 0; row < extent; ++row) {
            const int start = plane * N * N + row * N;
            for (int col = 0; col < extent; ++col)
                data[start + col] = col + 1;
        }
    }
}
/**
 * Set every one of the first N entries to its own index (data[i] = i).
 *
 * @param data  array with at least N entries
 * @param N     number of entries to write
 */
void initData2(double *data, int N) {
    // Each entry depends only on its index, so the fill order is irrelevant.
    for (int pos = N - 1; pos >= 0; --pos)
        data[pos] = pos;
}
/**
 * Set the first N complex values to 2 + 0i.
 *
 * @param d  array with at least N entries
 * @param N  number of entries to write
 */
void initComplex( complex<double> *d, int N) {
    const complex<double> fill(2, 0);
    int remaining = N;
    while (remaining-- > 0)
        d[remaining] = fill;
}
/**
 * Print the first N complex values on one line, each followed by a tab,
 * and finish the line with a newline.
 *
 * @param d  values to print
 * @param N  number of values to print
 */
void printComplex(complex<double> *d, int N) {
    int i = 0;
    while (i < N) {
        std::cout << d[i] << "\t";
        ++i;
    }
    std::cout << std::endl;
}
/**
 * Initialise an n1 x n2 x n3 field (n1 fastest) for the mirror tests.
 *
 * Entries whose three indices are all within the first n/2 + 1 positions of
 * their dimension receive consecutive values 1, 2, 3, ... in storage order;
 * every other entry is set to 0.
 *
 * @param data  field storage with n1*n2*n3 entries
 * @param n1    extent of the fastest varying dimension
 * @param n2    extent of the middle dimension
 * @param n3    extent of the slowest varying dimension
 */
void initMirror(double *data, int n1, int n2, int n3) {
    const int lim1 = n1 / 2 + 1;
    const int lim2 = n2 / 2 + 1;
    const int lim3 = n3 / 2 + 1;

    int next = 1;   // next value handed out inside the low-index corner
    int pos = 0;    // running storage offset, equals i*n2*n1 + j*n1 + k
    for (int i = 0; i < n3; ++i) {
        for (int j = 0; j < n2; ++j) {
            for (int k = 0; k < n1; ++k, ++pos) {
                const bool inside = (i < lim3) && (j < lim2) && (k < lim1);
                if (inside)
                    data[pos] = next++;
                else
                    data[pos] = 0;
            }
        }
    }
}
/**
 * Print a divider: c dashes followed by a newline.
 *
 * @param c  number of dashes; nothing but the newline is printed when c <= 0
 */
void printDiv(int c) {
    int left = c;
    while (left-- > 0)
        std::cout << "-";
    std::cout << std::endl;
}
/**
 * Print an n1 x n2 x n3 field (n1 fastest) below a 75 character divider.
 *
 * Values are tab separated, each row of n1 values ends with a newline, each
 * plane of n2 rows is followed by an empty line and one more empty line
 * closes the output.
 *
 * @param data  field storage with n1*n2*n3 entries
 * @param n1    values per row
 * @param n2    rows per plane
 * @param n3    number of planes
 */
void printMirror(double *data, int n1, int n2, int n3) {
    printDiv(75);

    int pos = 0;   // running storage offset, equals i*n2*n1 + j*n1 + k
    for (int plane = 0; plane < n3; ++plane) {
        for (int row = 0; row < n2; ++row) {
            for (int col = 0; col < n1; ++col)
                std::cout << data[pos++] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
/**
 * Add up the first datasize entries in index order.
 *
 * @param data      values to add
 * @param datasize  number of values
 * @return the sum, 0 when datasize <= 0
 */
double sumData(double *data, int datasize) {
    double total = 0;
    int i = 0;
    while (i < datasize)
        total += data[i++];
    return total;
}
/**
 * Green's function kernel test on an 8 x 8 x 4 grid.
 *
 * Runs the Green's integral, the Green's integration and the mirror kernel
 * on the device and prints the sum of the data read back after each step.
 * Afterwards multiplies two complex fields of 300 elements on the device and
 * prints the first three values of the inputs and of the product.
 *
 * @return 0 on completion
 */
int main(int argc, char *argv[]) {

    int ierr;
    int N1 = 8;
    int N2 = 8;
    int N3 = 4;
    int n1 = N1 / 2;
    int n2 = N2 / 2;
    int n3 = N3 / 2;
    int sizegreen = (n1 + 1) * (n2 + 1) * (n3 + 1);
    int sizerho = N1 * N2 * N3;

    double *data_green;
    double *data_rho;

    double hr_m0 = +4.0264984513873269e-04;
    double hr_m1 = +4.3305596731911289e-04;
    double hr_m2 = +8.3154085085560838e-04;

    DKSBase base = DKSBase();
    base.setAPI("Cuda", 4);
    base.setDevice("-gpu", 4);
    base.initDevice();

    int stream1, stream2;
    base.createStream(stream1);
    base.createStream(stream2);
    cout << "ID stream1: " << stream1 << endl;
    cout << "ID stream2: " << stream2 << endl;

    void *mem_green1, *mem_green2, *mem_rho1, *mem_rho2;
    mem_green1 = base.allocateMemory<double>(sizegreen, ierr);
    mem_green2 = base.allocateMemory<double>(sizegreen, ierr);
    mem_rho1 = base.allocateMemory<double>(sizerho, ierr);
    mem_rho2 = base.allocateMemory<double>(sizerho, ierr);

    printDiv(50);
    data_green = new double[sizegreen];
    data_rho = new double[sizerho];

    base.callGreensIntegral(mem_green1, n1+1, n2+1, n3+1, n1+1, n2+1,
                            hr_m0, hr_m1, hr_m2, stream1);
    base.readData<double>(mem_green1, data_green, sizegreen);
    cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
    cout << scientific << setprecision(16);
    for (int p = 0; p < 7; p++)
        cout << data_green[p] << "\t";
    cout << endl;
    //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);

    base.callGreensIntegration(mem_rho1, mem_green1, n1 + 1, n2 + 1, n3 + 1, -1);
    base.readData<double>(mem_rho1, data_rho, sizerho);
    cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
    //printMirror(data_rho, N1, N2, N3);

    base.callMirrorRhoField(mem_rho1, n1, n2, n3, -1);
    base.readData<double>(mem_rho1, data_rho, sizerho);
    cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
    //printMirror(data_rho, N1, N2, N3);

    printDiv(50);

    /* same sequence on the second set of buffers, currently disabled */
    /*
    base.callGreensIntegral(mem_green2, n1+1, n2+1, n3+1, n1+1, n2+1,
    1, 1, 1, -2);
    base.readData<double>(mem_green2, data_green, sizegreen);
    cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
    //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
    base.callGreensIntegration(mem_rho2, mem_green2, n1 + 1, n2 + 1, n3 + 1, -2);
    base.readData<double>(mem_rho2, data_rho, sizerho);
    cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
    //printMirror(data_rho, N1, N2, N3);
    base.callMirrorRhoField(mem_rho2, n1, n2, n3, -2);
    base.readData<double>(mem_rho2, data_rho, sizerho);
    cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
    //printMirror(data_rho, N1, N2, N3);
    */
    printDiv(50);

    base.freeMemory<double>(mem_green1, sizegreen);
    base.freeMemory<double>(mem_green2, sizegreen);
    base.freeMemory<double>(mem_rho1, sizerho);
    base.freeMemory<double>(mem_rho2, sizerho);
    delete [] data_green;
    delete [] data_rho;

    //test complex multiplication
    int compsize = 300;
    complex<double> *data1 = new complex<double>[compsize];
    complex<double> *data2 = new complex<double>[compsize];
    for (int i = 0; i < compsize; i++) {
        data1[i] = complex<double>(i+1, i+2);
        data2[i] = complex<double>(i+3, i+4);
    }

    for (int i = 0; i < 3; i++)
        cout << data1[i] << "\t";
    cout << endl;
    for (int i = 0; i < 3; i++)
        cout << data2[i] << "\t";
    cout << endl;

    void *ptr1, *ptr2;
    ptr1 = base.allocateMemory< complex<double> >(compsize, ierr);
    ptr2 = base.allocateMemory< complex<double> >(compsize, ierr);
    base.writeData< complex<double> >(ptr1, data1, compsize);
    base.writeData< complex<double> >(ptr2, data2, compsize);

    base.callMultiplyComplexFields(ptr1, ptr2, compsize);

    base.readData< complex<double> >(ptr1, data1, compsize);
    for (int i = 0; i < 3; i++)
        cout << data1[i] << "\t";
    cout << endl;

    base.freeMemory< complex<double> >(ptr1, compsize);
    base.freeMemory< complex<double> >(ptr2, compsize);

    /* release the host copies of the complex fields as well */
    delete [] data1;
    delete [] data2;

    return 0;
}

View File

@ -0,0 +1,191 @@
#include <sys/time.h>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "DKSImageReconstruction.h"
/** Position of a single voxel, stored as three consecutive floats. */
struct voxelPosition {
    float x, y, z;
};
/**
 * Fill the image with values rand() / RAND_MAX, i.e. numbers in [0, 1].
 *
 * @param image  array with at least size entries
 * @param size   number of entries to write
 */
void initImage(float *image, int size) {
    int i = 0;
    while (i < size) {
        // the conversion to float happens before the division
        image[i] = static_cast<float>(rand()) / RAND_MAX;
        ++i;
    }
}
/**
 * Lay out N*N*N voxels on a regular grid.
 *
 * Voxel (i, j, k) is stored at index i*N*N + j*N + k. Along each axis the
 * first voxel sits at 0 and every further voxel is placed 0.1 beyond its
 * predecessor on that axis: x grows with k, y with j and z with i.
 *
 * @param voxel  array with N*N*N entries
 * @param N      number of voxels per axis
 */
void initPosition(voxelPosition *voxel, int N) {
    const int row_stride = N;
    const int plane_stride = N * N;

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++) {
                const int idx = i * plane_stride + j * row_stride + k;

                // each coordinate is accumulated from the neighbour one step
                // back along its own axis
                voxel[idx].x = (k == 0) ? 0.0 : voxel[idx - 1].x + 0.1;
                voxel[idx].y = (j == 0) ? 0.0 : voxel[idx - row_stride].y + 0.1;
                voxel[idx].z = (i == 0) ? 0.0 : voxel[idx - plane_stride].z + 0.1;
            }
        }
    }
}
/**
 * Print the positions as three lines: all x values, then all y values, then
 * all z values. Each value is followed by a tab.
 *
 * @param voxel  positions to print
 * @param size   number of positions
 */
void printPosition(voxelPosition *voxel, int size) {
    int i;

    i = 0;
    while (i < size)
        std::cout << voxel[i++].x << "\t";
    std::cout << std::endl;

    i = 0;
    while (i < size)
        std::cout << voxel[i++].y << "\t";
    std::cout << std::endl;

    i = 0;
    while (i < size)
        std::cout << voxel[i++].z << "\t";
    std::cout << std::endl;
}
#define DIAMETER 2.0
/**
 * Tell whether a voxel lies within the source region.
 *
 * @param image_tmp    voxel positions
 * @param source_temp  position of the source
 * @param id           index of the voxel to test
 * @return true when the Euclidean distance between voxel id and the source
 *         is strictly less than DIAMETER / 2
 */
bool select_source(voxelPosition *image_tmp, voxelPosition source_temp, int id)
{
    const voxelPosition &voxel = image_tmp[id];

    float dx2 = pow(voxel.x-source_temp.x,2);
    float dy2 = pow(voxel.y-source_temp.y,2);
    float dz2 = pow(voxel.z-source_temp.z,2);
    float dist = sqrt(dx2 + dy2 + dz2);

    return dist < DIAMETER*0.5;
}
/**
 * Compute mean and error estimate of the image values around a source.
 *
 * All voxels accepted by select_source() for the given source are collected.
 * *average receives their mean, *std receives
 * sqrt(sum((average - value)^2) / n / (n - 1)) with n the number of
 * collected voxels.
 *
 * The results are not finite when too few voxels are selected: *average is
 * NaN for n == 0 and *std is NaN for n < 2 (division by zero).
 *
 * @param image_space     image value of every voxel
 * @param image_geometry  position of every voxel
 * @param source          position of the source
 * @param total_voxels    number of voxels
 * @param average         output: mean of the selected values
 * @param std             output: error estimate of the selected values
 */
void calculate_source(float *image_space , voxelPosition *image_geometry,
                      voxelPosition source, int total_voxels,
                      float *average, float *std)
{
    // Every voxel may be selected, so the scratch buffer is sized for the
    // worst case instead of a fixed 10000 entries.
    const int capacity = total_voxels > 0 ? total_voxels : 0;
    float *select = new float[capacity];

    int number_selected = 0;
    for (int voxel_id = 0; voxel_id < total_voxels; voxel_id++) {
        if ( select_source( image_geometry, source, voxel_id ) ) {
            select[number_selected] = image_space[voxel_id];
            number_selected += 1;
        }
    }

    *average = 0.0;
    *std = 0.0;

    for (int j = 0; j < number_selected; j++)
        *average += select[j];
    *average /= float(number_selected);

    for (int j = 0; j < number_selected; j++)
        *std += pow(*average-select[j],2);
    *std = sqrt(*std/number_selected/(number_selected-1));

    delete[] select;
}
int main(int argc, char *argv[]) {
int N = 8;
if (argc == 2)
N = atoi(argv[1]);
double ttotal;
struct timeval timeStart, timeEnd;
int total = N*N*N;
float *image = new float[total];
voxelPosition *geometry = new voxelPosition[total];
initImage(image, total);
initPosition(geometry, N);
voxelPosition source;
float avg[total], stdev[total];
gettimeofday(&timeStart, NULL);
for (int i = 0; i < total; i++) {
source.x = geometry[i].x;
source.y = geometry[i].y;
source.z = geometry[i].z;
calculate_source(image , geometry, source, total, &avg[i], &stdev[i]);
}
gettimeofday(&timeEnd, NULL);
ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
(timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
float avgavg = 0;
float avgstdev = 0;
for (int i = 0; i < total; i++) {
avgavg += avg[i] / total;
avgstdev += stdev[i] / total;
}
std::cout << "Total voxels: " << N*N*N << std::endl;
std::cout << "Dimensions [" << geometry[0].x << ":" << geometry[N-1].x << "]"
<< "[" << geometry[0].y << ":" << geometry[N*N-1].x << "]"
<< "[" << geometry[0].z << ":" << geometry[N*N*N-1].x << "]" << std::endl;
std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
void *image_space, *image_position, *source_position, *davg, *dstd;
int ierr;
DKSImageRecon base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
image_space = base.allocateMemory<float>(total, ierr);
image_position = base.allocateMemory<voxelPosition>(total, ierr);
source_position = base.allocateMemory<voxelPosition>(total, ierr);
davg = base.allocateMemory<float>(total, ierr);
dstd = base.allocateMemory<float>(total, ierr);
base.writeData<float>(image_space, image, total);
base.writeData<voxelPosition>(image_position, geometry, total);
base.writeData<voxelPosition>(source_position, geometry, total);
gettimeofday(&timeStart, NULL);
base.callCalculateSource(image_space, image_position, source_position,
davg, dstd, DIAMETER, total, total);
base.readData<float>(davg, avg, total);
base.readData<float>(dstd, stdev, total);
gettimeofday(&timeEnd, NULL);
ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
(timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
base.freeMemory<float>(image_space, total);
base.freeMemory<voxelPosition>(image_position, total);
base.freeMemory<voxelPosition>(source_position, total);
base.freeMemory<float>(dstd, total);
base.freeMemory<float>(davg, total);
avgavg = 0;
avgstdev = 0;
for (int i = 0; i < total; i++) {
avgavg += avg[i] / total;
avgstdev += stdev[i] / total;
}
std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
return N;
}

51
test/testMIC.cpp Normal file
View File

@ -0,0 +1,51 @@
#include <iostream>
#include "DKSBase.h"
using namespace std;
/**
 * Basic memory test for the OpenMP backend: allocates two device buffers,
 * writes two host arrays to them, reads them back into separate host arrays
 * and prints what was read.
 *
 * @return 0 on completion
 */
int main() {

    DKSBase base;
    base.setAPI("OpenMP", 6);
    base.initDevice();

    //init data
    int ierr;
    int N = 8;
    double *in_data = new double[N];
    double *in_data2 = new double[N];
    double *out_data = new double[N];
    double *out_data2 = new double[N];

    for (int i = 0; i < N; i++) {
        in_data[i] = i;
        in_data2[i] = i*i;
    }

    //test memory allocation, write and read operations
    void *d_ptr, *d2_ptr;
    d_ptr = base.allocateMemory<double>(N, ierr);
    d2_ptr = base.allocateMemory<double>(N, ierr);

    base.writeData<double>(d_ptr, in_data, N);
    base.writeData<double>(d2_ptr, in_data2, N);

    base.readData<double>(d_ptr, out_data, N);
    base.readData<double>(d2_ptr, out_data2, N);

    base.freeMemory<double>(d_ptr, N);
    base.freeMemory<double>(d2_ptr, N);

    //print results
    for (int i = 0; i < N; i++)
        cout << out_data[i] << "\t";
    cout << endl;
    for (int i = 0; i < N; i++)
        cout << out_data2[i] << "\t";
    cout << endl;

    //release host buffers
    delete[] in_data;
    delete[] in_data2;
    delete[] out_data;
    delete[] out_data2;

    return 0;
}

94
test/testMICOpenCL.cpp Normal file
View File

@ -0,0 +1,94 @@
#include <iostream>
#include <cstdlib>
#include "DKSBase.h"
#include "Utility/TimeStamp.h"
using namespace std;
/**
 * Sum kernel test with a selectable backend.
 *
 * Usage: program [api [device]]
 * With one argument the API name is taken from the command line and the
 * device defaults to "-gpu"; with two arguments both are taken from the
 * command line; in every other case "OpenCL" and "-gpu" are used.
 *
 * @return 0 on completion
 */
int main(int argc, char *argv[]) {

    /*
      Fixed-size buffers with room for the terminating NUL ("-gpu" alone
      needs five bytes). Command line values are copied with a length limit
      and silently truncated if they do not fit.
    */
    char api_name[64] = "OpenCL";
    char device_name[64] = "-gpu";

    if (argc == 2 || argc == 3) {
        strncpy(api_name, argv[1], sizeof(api_name) - 1);
        api_name[sizeof(api_name) - 1] = '\0';
    }
    if (argc == 3) {
        strncpy(device_name, argv[2], sizeof(device_name) - 1);
        device_name[sizeof(device_name) - 1] = '\0';
    }

    cout << "Use api: " << api_name << endl;
    cout << "Use device: " << device_name << endl;

    int ierr;
    int N = 10000;
    double *data = new double[N];
    double *data_out = new double[N];
    double *data_out2 = new double[N];

    for (int i = 0; i < N; i++) {
        data[i] = i;
    }

    //init dks base class, set the API and init connection with the device
    DKSBase base;
    base.setAPI(api_name, strlen(api_name));
    base.setDevice(device_name, strlen(device_name));
    base.initDevice();

    //data ptr
    void *data_ptr, *data_ptr2;

    //allocate memory
    data_ptr = base.allocateMemory<double>(N, ierr);
    data_ptr2 = base.allocateMemory<double>(N, ierr);

    //write data to memory and fill data on device
    base.writeData<double>(data_ptr, data, N);
    base.writeData<double>(data_ptr2, data, N);

    //base.callNt<double>(data_ptr2, data_ptr, 6, N, 1, 0);

    //calc sum
    base.callSum<double>(data_ptr2, data_ptr2, N);
    //base.callSum<double>(data_ptr, data_ptr, N);

    //chi^2
    //base.callChi2<double>(data_ptr, data_ptr, data_ptr, N);
    //base.callChi2<double>(data_ptr2, data_ptr2, data_ptr2, N);

    //read data
    base.readData<double>(data_ptr, data_out, N);
    base.readData<double>(data_ptr2, data_out2, N);

    //base.oclEventInfo();

    //free memory
    base.freeMemory<double>(data_ptr, N);
    base.freeMemory<double>(data_ptr2, N);

    /*
    for (int i = 0; i < N; i++) {
        cout << data[i] << "\t";
    }
    cout << endl << endl;
    for (int i = 0; i < N; i++) {
        cout << data_out[i] << "\t";
    }
    cout << endl << endl;
    for (int i = 0; i < N; i++) {
        cout << data_out2[i] << "\t";
    }
    cout << endl;
    */

    //release host buffers
    delete[] data;
    delete[] data_out;
    delete[] data_out2;

    return 0;
}

68
test/testMICPush.cpp Normal file
View File

@ -0,0 +1,68 @@
#include <iostream>
#include <cstdlib>
#include "DKSBase.h"
using namespace std;
typedef struct {
    double x;
    double y;
    double z;
} Part;
/**
 * Fill every component of the first N particles with a random number in
 * [0, 1].
 *
 * rand() is converted to double before the division; dividing the two
 * integers directly truncates to 0 for every draw except RAND_MAX itself.
 *
 * @param data  array with at least N entries
 * @param N     number of entries to write
 */
void initData(Part *data, int N) {
    for (int i = 0; i < N; i++) {
        data[i].x = static_cast<double>(rand()) / RAND_MAX;
        data[i].y = static_cast<double>(rand()) / RAND_MAX;
        data[i].z = static_cast<double>(rand()) / RAND_MAX;
    }
}
int main() {
int ierr;
int N = 100000;
//__declspec(align(64)) Part *R = new Part[N];
//__declspec(align(64)) Part *P = new Part[N];
Part *R = new Part[N];
Part *P = new Part[N];
initData(R, N);
initData(P, N);
DKSBase dksbase;
dksbase.setAPI("OpenMP", 6);
dksbase.setDevice("-mic", 4);
dksbase.initDevice();
void *r_ptr, *p_ptr, *dt_ptr;
r_ptr = dksbase.allocateMemory<Part>(N, ierr);
p_ptr = dksbase.allocateMemory<Part>(N, ierr);
dt_ptr = dksbase.allocateMemory<double>(N, ierr);
dksbase.writeData<Part>(r_ptr, R, N);
cout << "====================START PUSH====================" << endl;
for (int i = 0; i < 5; i++) {
//write r to device
dksbase.writeData<Part>(r_ptr, R, N);
//calc push
dksbase.callParallelTTrackerPush (r_ptr, p_ptr, N, dt_ptr,
0.001, 1, false, NULL);
//read R from device
dksbase.readDataAsync<Part> (r_ptr, R, N, NULL);
}
cout << "====================END PUSH====================" << endl;
dksbase.freeMemory<Part>(r_ptr, N);
dksbase.freeMemory<Part>(p_ptr, N);
dksbase.freeMemory<double>(dt_ptr, N);
return 0;
}

89
test/testMPI.cpp Normal file
View File

@ -0,0 +1,89 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include "DKSBase.h"
using namespace std;
/*
 * Print nprocs rows of N tab-terminated ints taken consecutively from data.
 * When message is not the empty string it is written first, unmodified.
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  for (int row = 0; row < nprocs; ++row) {
    for (int col = 0; col < N; ++col) {
      std::cout << data[row*N + col] << "\t";
    }
    std::cout << std::endl;
  }
}
/* Store rank + 1 in each of the first N entries of data. */
void initData(int *data, int N, int rank) {
  int idx = 0;
  while (idx < N) {
    data[idx] = rank + 1;
    ++idx;
  }
}
/**
 * MPI gather / scatter round trip through a DKS device buffer.
 *
 * Every rank fills n ints with rank + 1. Rank 0 gathers them into a device
 * allocation, reads the gathered block back to the host, and scatters the
 * device buffer out again; each rank then prints the n values it received.
 *
 * NOTE(review): mem_ptr is handed directly to MPI_Gather / MPI_Scatter, so
 * this presumably relies on the MPI library accepting device pointers -
 * confirm.
 */
int main(int argc, char *argv[]) {

  int ierr;
  int rank, nprocs;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  cout << "Rank " << (rank+1) << " from " << nprocs << endl;

  int n = 8;
  int *hdata_in = new int[n];
  int *hdata_out = new int[n];
  initData(hdata_in, n, rank);

  cout << "In data for process " << rank+1 << ":\t";
  printData(hdata_in, n, 1);

  DKSBase base = DKSBase();
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();

  if (rank == 0) {
    int *hdata_out_all = new int[nprocs*n];
    void* mem_ptr;
    mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);

    MPI_Gather(hdata_in, n, MPI_INT, mem_ptr, n, MPI_INT, 0, MPI_COMM_WORLD);
    base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
    MPI_Scatter(mem_ptr, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
    base.freeMemory<int>(mem_ptr, n*nprocs);

    printData(hdata_out_all, n, nprocs, "Out data 1:\n");
    cout << "Scatter data for proces: " << rank + 1 << ": \t";
    // Show what the scatter delivered, not the input that was sent.
    printData(hdata_out, n, 1);

    delete[] hdata_out_all;
  } else {
    // The root-side buffer arguments are ignored on non-root ranks, but the
    // count and datatype are still passed as valid values rather than NULL.
    MPI_Gather(hdata_in, n, MPI_INT, NULL, n, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(NULL, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
    cout << "Scatter data for proces: " << rank + 1 << ": \t";
    printData(hdata_out, n, 1);
  }

  delete[] hdata_in;
  delete[] hdata_out;

  MPI_Finalize();
  return 0;
}

91
test/testMPIFFT.cpp Normal file
View File

@ -0,0 +1,91 @@
#include <iostream>
#include <mpi.h>
#include <string.h>
#include "DKSBase.h"
using namespace std;
/*
 * Print nprocs rows of N tab-terminated complex values taken consecutively
 * from data. When message is not the empty string it is written first.
 */
void printData(complex<double> *data, int N, int nprocs, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  for (int row = 0; row < nprocs; ++row) {
    for (int col = 0; col < N; ++col) {
      std::cout << data[row*N + col] << "\t";
    }
    std::cout << std::endl;
  }
}
/* Set each of the first N entries of data to the complex value (rank + 1, 0). */
void initData(complex<double> *data, int N, int rank) {
  int idx = 0;
  while (idx < N) {
    data[idx] = complex<double>((double)rank + 1.0, 0.0);
    ++idx;
  }
}
/**
 * MPI gather -> device FFT -> scatter test.
 *
 * Every rank fills n complex values with (rank + 1, 0). Rank 0 gathers them
 * into a device allocation, runs a 1D FFT over all n*nprocs values, reads
 * the result back, and scatters the device buffer; each rank prints the n
 * values it received.
 *
 * NOTE(review): mem_ptr is handed directly to MPI_Gather / MPI_Scatter, so
 * this presumably relies on the MPI library accepting device pointers -
 * confirm.
 */
int main(int argc, char *argv[]) {

  int ierr;
  int rank, nprocs;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  cout << "Rank " << (rank+1) << " from " << nprocs << endl;

  int n = 8;
  complex<double> *hdata_in = new complex<double>[n];
  complex<double> *hdata_out = new complex<double>[n];
  initData(hdata_in, n, rank);

  cout << "In data for process " << rank+1 << ":\t";
  printData(hdata_in, n, 1);

  DKSBase base = DKSBase();
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();

  if (rank == 0) {
    complex<double> *hdata_out_all = new complex<double>[nprocs*n];
    void* mem_ptr;
    mem_ptr = base.allocateMemory< complex<double> >(nprocs*n, ierr);

    MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, mem_ptr, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);

    // One-dimensional transform over the whole gathered block.
    int dimsize[3] = {n*nprocs, 1, 1};
    base.callFFT(mem_ptr, 1, dimsize);

    base.readData< complex<double> >(mem_ptr, hdata_out_all, n*nprocs);
    MPI_Scatter(mem_ptr, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
    base.freeMemory< complex<double> >(mem_ptr, n*nprocs);

    printData(hdata_out_all, n, nprocs, "Out data 1:\n");
    cout << "Scatter data for proces: " << rank + 1 << ": \t";
    printData(hdata_out, n, 1);

    delete[] hdata_out_all;
  } else {
    // The root-side buffer arguments are ignored on non-root ranks, but the
    // count and datatype are still passed as valid values rather than NULL.
    MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, NULL, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
    MPI_Scatter(NULL, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
    cout << "Scatter data for proces: " << rank + 1 << ": \t";
    printData(hdata_out, n, 1);
  }

  delete[] hdata_in;
  delete[] hdata_out;

  MPI_Finalize();
  return 0;
}

75
test/testMemObjects.cpp Normal file
View File

@ -0,0 +1,75 @@
#include <iostream>
#include <cstdlib>
#include "DKSBase.h"
using namespace std;
/**
 * Allocates eight device buffers of N doubles, writes the same host array
 * into each one, then frees them in allocation order.
 *
 * Usage: testMemObjects [n]   (n defaults to 10; N = 2 << n, i.e. 2^(n+1))
 *
 * @return 0 always; the ierr values are collected but not inspected.
 */
int main(int argc, char *argv[]) {

  int ierr, n, N;

  if (argc > 1)
    n = atoi(argv[1]);
  else
    n = 10;

  N = 2 << n;
  cout << "Elements: " << N << endl;

  double *data = new double[N];
  for (int i = 0; i < N; i++)
    data[i] = (double)i / N;

  DKSBase base = DKSBase();
  base.setAPI("OpenCL", 6);
  base.setDevice("-gpu", 4);
  base.initDevice();

  // Allocate and fill each buffer in turn, matching the allocate / write
  // interleaving of the unrolled version this loop replaces.
  const int kNumBuffers = 8;
  void *ptrs[kNumBuffers];
  for (int i = 0; i < kNumBuffers; i++) {
    ptrs[i] = base.allocateMemory<double>(N, ierr);
    ierr = base.writeData<double>(ptrs[i], data, N);
  }

  for (int i = 0; i < kNumBuffers; i++)
    base.freeMemory<double>(ptrs[i], N);

  //free host memory
  delete[] data;

  return 0;
}

73
test/testOffset.cpp Normal file
View File

@ -0,0 +1,73 @@
#include <iostream>
#include <cstdlib>
#include "DKSBase.h"
using namespace std;
/**
 * Offset read / write test.
 *
 * Writes the first n host values into a device buffer of N doubles twice
 * (at offsets 0 and 4), then reads back the whole buffer and an n-element
 * window starting at offset 2, and prints the input and both results.
 *
 * Usage: testOffset [api_name [device_name]]
 *        defaults: api "OpenCL", device "-gpu"
 */
int main(int argc, char *argv[]) {

  // Fixed-size name buffers; command-line values longer than the buffer are
  // truncated instead of overrunning it.
  const int kNameLen = 10;
  char api_name[kNameLen] = "OpenCL";
  char device_name[kNameLen] = "-gpu";

  if (argc == 2 || argc == 3) {
    strncpy(api_name, argv[1], kNameLen - 1);
    api_name[kNameLen - 1] = '\0';
  }
  if (argc == 3) {
    strncpy(device_name, argv[2], kNameLen - 1);
    device_name[kNameLen - 1] = '\0';
  }

  int ierr,n, N;
  N = 8;
  n = 4;

  double *data_in = new double[N];
  double *data_out_1 = new double[N];
  double *data_out_2 = new double[N];
  for (int i = 0; i < N; i++) {
    data_in[i] = (double)i / N;
    data_out_1[i] = 0.0;
    data_out_2[i] = 0.0;
  }

  cout << "Run example on: " << api_name << " using " << device_name << endl;

  DKSBase base = DKSBase();
  base.setAPI(api_name, strlen(api_name));
  // The length must describe device_name itself, not the API name.
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();

  void *ptr1;
  ptr1 = base.allocateMemory<double>(N, ierr);

  // same n host values written at offsets 0 and 4
  ierr = base.writeData<double>(ptr1, data_in, n, 0);
  ierr = base.writeData<double>(ptr1, data_in, n, 4);

  // full read, then an n-element read starting at offset 2
  ierr = base.readData<double>(ptr1, data_out_1, N);
  ierr = base.readData<double>(ptr1, data_out_2, n, 2);

  base.freeMemory<double>(ptr1, N);

  for (int i = 0; i < N; i++)
    cout << data_in[i] << "\t";
  cout << endl;
  for (int i = 0; i < N; i++)
    cout << data_out_1[i] << "\t";
  cout << endl;
  for (int i = 0; i < N; i++)
    cout << data_out_2[i] << "\t";
  cout << endl;

  //free host memory
  delete[] data_in;
  delete[] data_out_1;
  delete[] data_out_2;

  return 0;
}

81
test/testOffsetMPI.cpp Normal file
View File

@ -0,0 +1,81 @@
#include <mpi.h>
#include <iostream>
#include <cstdlib>
#include "DKSBase.h"
using namespace std;
int main(int argc, char *argv[]) {
int rank, size;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
cout << "Rank " << rank << " from " << size << endl;
int ierr, N, n;
N = 8;
n = N / 2;
double *data_in = new double[n];
for (int i = 0; i < n; i++)
data_in[i] = (double)rank + 1.0 + (double)i / n;
DKSBase base = DKSBase();
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
if (rank == 0) {
//alocate memory of size N
void *ptr1;
ptr1 = base.allocateMemory<double>(size*N, ierr);
cout << "Sent pointer: " << ptr1 << endl;
//send ptr to other processes
MPI_Send(&ptr1, sizeof(void*), MPI_BYTE, 1, 123, MPI_COMM_WORLD);
//wrtie n data with no offset to device and wait for other processes
ierr = base.writeData<double>(ptr1, data_in, n, rank*n);
MPI_Barrier(MPI_COMM_WORLD);
//read memory of size N from device
double *data_out = new double[N];
ierr = base.readData<double>(ptr1, data_out, N);
//free device memory
base.freeMemory<double>(ptr1, size*N);
//print results
for (int i = 0; i < n; i++)
cout << data_in[i] << "\t";
cout << endl;
for (int i = 0; i < N; i++)
cout << data_out[i] << "\t";
cout << endl;
} else {
//receive device memory pointer
void *ptr2;
MPI_Recv(&ptr2, sizeof(void*), MPI_BYTE, 0, 123, MPI_COMM_WORLD, NULL);
cout << "Received pointer: " << ptr2 << endl;
//write data with an offset
base.writeData<double>(ptr2, data_in, n, rank*n);
MPI_Barrier(MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}

57
test/testPush.cpp Normal file
View File

@ -0,0 +1,57 @@
#include <iostream>
#include <cstdlib>
#include <vector>
#include "DKSBase.h"
#include <vector_types.h>
#include "cuda_runtime.h"
using namespace std;
/**
 * Fill the first N elements of data with pseudo-random components in [0, 1].
 *
 * The quotient is computed in floating point: dividing the int returned by
 * rand() by the int RAND_MAX truncates to 0 for every draw except
 * rand() == RAND_MAX.
 *
 * @param data array with room for at least N elements
 * @param N    number of elements to initialise
 */
void initData(double3 *data, int N) {
  for (int i = 0; i < N; i++) {
    data[i].x = static_cast<double>(rand()) / RAND_MAX;
    data[i].y = static_cast<double>(rand()) / RAND_MAX;
    data[i].z = static_cast<double>(rand()) / RAND_MAX;
  }
}
int main() {
int ierr;
int N = 1000000;
double3 *R = new double3[N];
double3 *P = new double3[N];
initData(R, N);
initData(P, N);
DKSBase dksbase;
dksbase.setAPI("Cuda", 4);
dksbase.setDevice("-gpu", 4);
dksbase.initDevice();
void *r_ptr, *p_ptr;
r_ptr = dksbase.allocateMemory<double3>(N, ierr);
p_ptr = dksbase.allocateMemory<double3>(N, ierr);
dksbase.writeData<double3>(r_ptr, R, N);
dksbase.writeData<double3>(p_ptr, P, N);
for (int i = 0; i < 100; i++)
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, N, NULL, 0.5, 1, false);
dksbase.readData<double3>(r_ptr, R, N);
dksbase.readData<double3>(p_ptr, P, N);
dksbase.freeMemory<double3>(r_ptr, N);
dksbase.freeMemory<double3>(p_ptr, N);
return 0;
}

168
test/testRCFFT.cpp Normal file
View File

@ -0,0 +1,168 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/* Print N2 rows of N1 space-separated reals, one row per line. */
void printData(double* &data, int N1, int N2);
/* Print N2/2+1 rows of N1 complex values; components with magnitude below 1e-5 are shown as 0. */
void printData(complex<double>* &data, int N1, int N2);
/* Print the real and imaginary parts of an N*N*N complex array; values within +/-10e-5 are shown as 0. dim is not used. */
void printData3DN4(complex<double>* &data, int N, int dim);
/* Print an N*N*N real array; values within +/-10e-5 are shown as 0. dim is not used. */
void printData3DN4(double* &data, int N, int dim);
/* Print the summed absolute element-wise difference of two real arrays of dimension dim and edge length N. */
void compareData(double* &data1, double* &data2, int N, int dim);
/**
 * Real-to-complex FFT test on an N1 x N2 grid.
 *
 * Uploads N1*N2 reals, runs callR2CFFT with 2 dimensions, reads back
 * N1*(N2/2+1) complex values and prints input and output.
 *
 * Usage: testRCFFT [N1 N2]   (both default to 4)
 */
int main(int argc, char *argv[]) {

  int N1 = 4;
  int N2 = 4;

  if (argc == 3) {
    N1 = atoi(argv[1]);
    N2 = atoi(argv[2]);
  }

  int dimsize[3] = {N1, N2, 1};

  cout << "Begin RC 3D FFT tests, grid = " << N1 << "\t" << N2 << endl;

  int sizereal = N1*N2;
  int sizecomp = N1*(N2/2+1);

  double *cdata = new double[sizereal];
  complex<double> *cfft = new complex<double>[sizecomp];

  // every row holds the ramp 0, 1/N1, 2/N1, ...
  for (int i = 0; i < N2; i++) {
    for (int j = 0; j < N1; j++) {
      cdata[i*N1 + j] = (double)(j) / N1;
    }
  }

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();

  void *real_ptr, *comp_ptr;
  int ierr;

  /* allocate memory on device */
  real_ptr = base.allocateMemory<double>(sizereal, ierr);
  comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);

  /* write data to device */
  ierr = base.writeData<double>(real_ptr, cdata, sizereal);

  /* execute fft */
  base.callR2CFFT(real_ptr, comp_ptr, 2, dimsize);

  /* read data from device */
  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);

  /* free device memory */
  base.freeMemory<double>(real_ptr, sizereal);
  base.freeMemory< complex<double> >(comp_ptr, sizecomp);

  cout << "FFT complete" << endl;

  /* print results */
  printData(cdata, N1, N2);
  printData(cfft, N1, N2);

  /* free host memory */
  delete[] cdata;
  delete[] cfft;

  return 0;
}
/*
 * Print N2 rows of N1 reals. Every value is followed by a single space,
 * every row by a newline, and the whole table by one blank line.
 */
void printData(double* &data, int N1, int N2) {
  for (int row = 0; row < N2; ++row) {
    for (int col = 0; col < N1; ++col)
      std::cout << data[row*N1 + col] << " ";
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/*
 * Print N2/2+1 rows of N1 complex values. A real or imaginary part lying
 * strictly between -0.00001 and 0.00001 is replaced by 0.0 before printing.
 * Every value is followed by a space, every row by a newline, and the table
 * by one blank line.
 */
void printData(complex<double>* &data, int N1, int N2) {
  const int rows = N2/2 + 1;
  for (int row = 0; row < rows; ++row) {
    for (int col = 0; col < N1; ++col) {
      double re = data[row*N1 + col].real();
      double im = data[row*N1 + col].imag();
      if (re > -0.00001 && re < 0.00001) re = 0.0;
      if (im > -0.00001 && im < 0.00001) im = 0.0;
      std::cout << complex<double>(re, im) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/*
 * Print an N*N*N complex array as "re; im" pairs separated by tabs.
 * One output line is produced per middle index j, containing all (i, k)
 * combinations; parts strictly within +/-10e-5 are printed as 0.
 * The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {
  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      for (int k = 0; k < N; ++k) {
        const complex<double> &value = data[i*N*N + j*N + k];
        double re = value.real();
        double im = value.imag();
        if (re > -10e-5 && re < 10e-5) re = 0;
        if (im > -10e-5 && im < 10e-5) im = 0;
        std::cout << re << "; " << im << "\t";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/*
 * Print an N*N*N real array, tab separated. One output line is produced per
 * middle index j, containing all (i, k) combinations; a value that is
 * neither above 10e-5 nor below -10e-5 is printed as the integer 0.
 * The dim argument is not used.
 */
void printData3DN4(double* &data, int N, int dim) {
  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      for (int k = 0; k < N; ++k) {
        const double value = data[i*N*N + j*N + k];
        const bool significant = (value > 10e-5 || value < -10e-5);
        if (!significant)
          std::cout << 0 << "\t";
        else
          std::cout << value << "\t";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/*
 * Sum |data1[id] - data2[id]| over a grid with edge length N and print it as
 * "Size N CC <--> CC diff: sum". The two outer extents collapse to 1 when
 * dim is below 3 / below 2 respectively.
 */
void compareData(double* &data1, double* &data2, int N, int dim) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double total = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      // offset of the innermost run for this (i, j) pair
      const int base = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k)
        total += fabs(data1[base + k] - data2[base + k]);
    }
  }
  std::cout << "Size " << N << " CC <--> CC diff: " << total << std::endl;
}

181
test/testStockFFT3D.cpp Normal file
View File

@ -0,0 +1,181 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/* Print the real parts of an N*N*N complex array; values within +/-10e-5 are shown as 0. dim is not used. */
void printData3DN4(complex<double>* &data, int N, int dim);
/* Print the summed absolute difference of real and imaginary parts of two complex arrays of dimension dim and edge length N. */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/**
 * Runs the same N x N x N complex FFT through three paths - the OpenCL
 * Stockham kernel, the Cuda backend and the OpenCL radix-2 kernel - timing
 * five repetitions of each and printing the pairwise result differences.
 *
 * Usage: testStockFFT3D [n]   (N = 2^n, n defaults to 2)
 *
 * @return 0 on completion, 1 when n is outside the supported range.
 */
int main(int argc, char *argv[]) {

  int n = 2;
  if (argc == 2)
    n = atoi(argv[1]);

  // N*N*N is held in an int, so the exponent is limited to keep it in range.
  if (n < 0 || n > 10) {
    cout << "FFT size exponent must be between 0 and 10" << endl;
    return 1;
  }

  // integer power of two; avoids the round trip through floating point
  int N = 1 << n;
  const int total = N*N*N;

  cout << "Begin DKS Base tests" << endl;
  cout << "FFT size: " << N << endl;

  int dimsize[3] = {N, N, N};

  complex<double> *cdata = new complex<double>[total];
  complex<double> *cfft = new complex<double>[total];
  complex<double> *cfft2 = new complex<double>[total];
  complex<double> *cfft3 = new complex<double>[total];

  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        // one shared index for all four arrays; cfft3 used i*N + j*N + k
        // before, which skipped most of the array
        const int idx = i*N*N + j*N + k;
        cdata[idx] = complex<double>(k, 0);
        cfft[idx] = complex<double>(0, 0);
        cfft2[idx] = complex<double>(0, 0);
        cfft3[idx] = complex<double>(0, 0);
      }
    }
  }

  if (N == 4)
    printData3DN4(cdata, N, 3);

  /* init DKSBase */
  cout << "Init device and set function" << endl;

  int ierr;
  timestamp_t t0, t1;

  /* stockham radix-2 out-of-place fft */
  DKSBase base2;
  base2.setAPI("OpenCL", 6);
  base2.setDevice("-gpu", 4);
  base2.initDevice();

  cout << endl;
  void *src_ptr;
  for (int i = 0; i < 5; i++) {
    t0 = get_timestamp();
    src_ptr = base2.allocateMemory< complex<double> >(total, ierr);
    base2.writeData< complex<double> >(src_ptr, cdata, total);
    base2.callFFTStockham(src_ptr, 3, dimsize);
    base2.readData< complex<double> >(src_ptr, cfft2, total);
    base2.freeMemory< complex<double> >(src_ptr, total);
    t1 = get_timestamp();
    cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
  }

  if (N == 4)
    printData3DN4(cfft2, N, 3);

  cout << endl;

  /* CUDA cufft */
  DKSBase base3;
  base3.setAPI("Cuda", 4);
  base3.setDevice("-gpu", 4);
  base3.initDevice();

  cout << endl;
  void *cuda_ptr;
  for (int i = 0; i < 5; i++) {
    t0 = get_timestamp();
    cuda_ptr = base3.allocateMemory< complex<double> >(total, ierr);
    base3.writeData< complex<double> >(cuda_ptr, cdata, total);
    base3.callFFT(cuda_ptr, 3, dimsize);
    base3.readData< complex<double> >(cuda_ptr, cfft3, total);
    base3.freeMemory< complex<double> >(cuda_ptr, total);
    t1 = get_timestamp();
    cout << "Cuda FFT time: " << get_secs(t0, t1) << endl;
  }

  if (N == 4)
    printData3DN4(cfft3, N, 3);

  cout << endl;

  /* radix-2 in place fft */
  DKSBase base;
  base.setAPI("OpenCL", 6);
  base.setDevice("-gpu", 4);
  base.initDevice();

  cout << endl;
  void *mem_ptr;
  for (int i = 0; i < 5; i++) {
    t0 = get_timestamp();
    mem_ptr = base.allocateMemory< complex<double> >(total, ierr);
    base.writeData< complex<double> >(mem_ptr, cdata, total);
    base.callFFT(mem_ptr, 3, dimsize);
    base.readData< complex<double> >(mem_ptr, cfft, total);
    base.freeMemory< complex<double> >(mem_ptr, total);
    t1 = get_timestamp();
    cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
  }

  if (N == 4)
    printData3DN4(cfft, N, 3);

  cout << endl;

  /* compare results */
  cout << endl;
  cout << "Radix 2 vs Stockham: ";
  compareData(cfft, cfft2, N, 3);

  cout << "Radix 2 vs Cufft: ";
  compareData(cfft, cfft3, N, 3);

  cout << "Stockham vs Cufft: ";
  compareData(cfft2, cfft3, N, 3);

  /* free host memory */
  delete[] cdata;
  delete[] cfft;
  delete[] cfft2;
  delete[] cfft3;

  return 0;
}
/*
 * Print the real parts of an N*N*N complex array, tab separated. One output
 * line is produced per middle index j, containing all (i, k) combinations;
 * a real part that is neither above 10e-5 nor below -10e-5 is printed as
 * the integer 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {
  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      for (int k = 0; k < N; ++k) {
        const double re = data[i*N*N + j*N + k].real();
        const bool significant = (re > 10e-5 || re < -10e-5);
        if (!significant)
          std::cout << 0 << "\t";
        else
          std::cout << re << "\t";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/*
 * Sum the absolute differences of the real and of the imaginary parts of
 * data1 and data2 over a grid with edge length N, and print the result as
 * "CC <--> CC diff: sum". The two outer extents collapse to 1 when dim is
 * below 3 / below 2 respectively.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;

  double total = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      // offset of the innermost run for this (i, j) pair
      const int base = i*ni*ni + j*nj;
      for (int k = 0; k < N; ++k) {
        const complex<double> &lhs = data1[base + k];
        const complex<double> &rhs = data2[base + k];
        total += fabs(lhs.real() - rhs.real());
        total += fabs(lhs.imag() - rhs.imag());
      }
    }
  }
  std::cout << "CC <--> CC diff: " << total << std::endl;
}

107
test/testStockhamFFT.cpp Normal file
View File

@ -0,0 +1,107 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/**
 * Compares the in-place radix-2 FFT with the Stockham FFT on a 1D array of
 * N = 2^n complex values, timing five repetitions of each and printing the
 * summed absolute difference of the two results.
 *
 * Usage: testStockhamFFT [api_name [device_name [n]]]
 *        defaults: api "OpenCL", device "-gpu", n = 2
 *
 * @return 0 on completion, 1 when n is outside the supported range.
 */
int main(int argc, char *argv[]) {

  int n = 2;

  // Fixed-size name buffers; command-line values longer than the buffer are
  // truncated instead of overrunning it.
  const int kNameLen = 10;
  char api_name[kNameLen] = "OpenCL";
  char device_name[kNameLen] = "-gpu";

  if (argc >= 2 && argc <= 4) {
    strncpy(api_name, argv[1], kNameLen - 1);
    api_name[kNameLen - 1] = '\0';
  }
  if (argc == 3 || argc == 4) {
    strncpy(device_name, argv[2], kNameLen - 1);
    device_name[kNameLen - 1] = '\0';
  }
  if (argc == 4)
    n = atoi(argv[3]);

  // 1 << n must stay inside a positive int
  if (n < 0 || n > 30) {
    cout << "FFT size exponent must be between 0 and 30" << endl;
    return 1;
  }
  int N = 1 << n;

  cout << "Use api: " << api_name << endl;

  cout << "Begin DKS Base tests" << endl;
  cout << "FFT size: " << N << endl;

  int dimsize[3] = {N, N, N};

  complex<double> *cdata = new complex<double>[N];
  complex<double> *cfft = new complex<double>[N];
  complex<double> *cfft2 = new complex<double>[N];

  for (int i = 0; i < N; i++) {
    cdata[i] = complex<double>((double)i / N, 0);
    cfft[i] = complex<double>(0, 0);
    cfft2[i] = complex<double>(0, 0);
  }

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  // The length must describe device_name itself, not the API name.
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();

  timestamp_t t0, t1;

  /* radix-2 in place fft */
  void *mem_ptr;
  int ierr;

  for (int i = 0; i < 5; i++) {
    t0 = get_timestamp();
    mem_ptr = base.allocateMemory< complex<double> >(N, ierr);
    base.writeData< complex<double> >(mem_ptr, cdata, N);
    base.callFFT(mem_ptr, 1, dimsize);
    base.readData< complex<double> >(mem_ptr, cfft, N);
    base.freeMemory< complex<double> >(mem_ptr, N);
    t1 = get_timestamp();
    cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
  }

  cout << endl;

  /* stockham radix-2 out-of-place fft */
  void *src_ptr;
  for (int i = 0; i < 5; i++) {
    t0 = get_timestamp();
    src_ptr = base.allocateMemory< complex<double> >(N, ierr);
    base.writeData< complex<double> >(src_ptr, cdata, N);
    base.callFFTStockham(src_ptr, 1, dimsize);
    base.readData< complex<double> >(src_ptr, cfft2, N);
    base.freeMemory< complex<double> >(src_ptr, N);
    t1 = get_timestamp();
    cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
  }

  double diff = 0;
  for (int i = 0; i < N; i++) {
    diff += fabs(cfft[i].real() - cfft2[i].real());
    diff += fabs(cfft[i].imag() - cfft2[i].imag());
  }

  cout << endl << "Difference: " << diff << endl;

  if (diff > 0.00001) {
    // show at most the first 10 pairs; N may be smaller than 10
    const int shown = (N < 10) ? N : 10;
    for (int i = 0; i < shown; i++) {
      cout << cfft[i] << "\t" << cfft2[i] << endl;
    }
  }

  /* free host memory */
  delete[] cdata;
  delete[] cfft;
  delete[] cfft2;

  return 0;
}

View File

@ -0,0 +1,227 @@
#include <iostream>
#include <vector>
#include <time.h>
#include <sys/time.h>
#include "DKSBase.h"
#include <vector_types.h>
#include "cuda_runtime.h"
using namespace std;
/* Triple of doubles (x, y, z). */
struct Vector {
  double x;
  double y;
  double z;
};

/* Return a Vector whose three components are all 0.5. */
Vector initVector() {
  Vector result = {0.5, 0.5, 0.5};
  return result;
}

/* Assign a freshly built initVector() value to each of the first N elements of v. */
void initVectors(Vector *v, int N) {
  int idx = 0;
  while (idx < N) {
    v[idx] = initVector();
    ++idx;
  }
}
/* Set each of the first N entries of data to 0.005. */
void initDouble(double *data, int N) {
  int idx = 0;
  while (idx < N) {
    data[idx] = 0.005;
    ++idx;
  }
}
/* Set each of the first N entries of data to -1. */
void initLastSect(long *data, int N) {
  int idx = 0;
  while (idx < N) {
    data[idx] = -1;
    ++idx;
  }
}
/* Print "checksum: " followed by the sum of all x, y and z components of the first N vectors. */
void checkSum(Vector *v, int N) {
  double total = 0;
  int idx = 0;
  while (idx < N) {
    const Vector &item = v[idx];
    total += item.x + item.y + item.z;
    ++idx;
  }
  std::cout << "checksum: " << total << std::endl;
}
/**
 * Time-integration test: runs the push and push-transform calls on two
 * streams, first five warm-up iterations and then `loop` timed iterations,
 * printing checksums of the host copies of r and x before and after.
 *
 * Options:
 *   -mic        use the OpenMP API with the "-mic" device (default Cuda / "-gpu")
 *   -npart <n>  number of particles (default 10)
 *   -loop <n>   number of timed iterations (default 10)
 */
int main(int argc, char *argv[]) {

  int loop = 10;
  int numpart = 10;
  char api_name[10] = "Cuda";
  char device_name[10] = "-gpu";

  for (int i = 1; i < argc; i++) {
    const string arg(argv[i]);
    // a value-taking flag given as the last argument has nothing to read;
    // it is ignored rather than reading past the end of argv
    const bool has_value = (i + 1 < argc);

    if (arg == "-mic") {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
    } else if (arg == "-npart" && has_value) {
      numpart = atoi(argv[++i]);
    } else if (arg == "-loop" && has_value) {
      loop = atoi(argv[++i]);
    }
  }

  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Number of particles: " << numpart << endl;
  cout << "------------------------------------------------------------" << endl;

  //init p,r and dt arrays to test time integration
  Vector *r = new Vector[numpart];
  Vector *p = new Vector[numpart];
  Vector *x = new Vector[numpart];
  Vector *ori = new Vector[5];
  initVectors(r, numpart);
  initVectors(p, numpart);
  initVectors(x, numpart);
  initVectors(ori, 5);

  double *dt = new double[numpart];
  initDouble(dt, numpart);

  long *ls = new long[numpart];
  initLastSect(ls, numpart);

  //init dks
  int ierr;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  // The length must describe device_name itself, not the API name
  // ("OpenMP" and "-mic" differ in length).
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();

  int stream1, stream2;
  base.createStream(stream1);
  base.createStream(stream2);

  base.registerHostMemory(r, numpart);
  base.registerHostMemory(p, numpart);
  base.registerHostMemory(x, numpart);
  base.registerHostMemory(dt, numpart);
  base.registerHostMemory(ls, numpart);

  //***test parallelttrackerpush***//
  void *r_ptr, *p_ptr, *x_ptr, *dt_ptr, *ls_ptr, *ori_ptr;

  //allocate memory on the device
  r_ptr = base.allocateMemory<Vector>(numpart, ierr);
  p_ptr = base.allocateMemory<Vector>(numpart, ierr);
  x_ptr = base.allocateMemory<Vector>(numpart, ierr);
  dt_ptr = base.allocateMemory<double>(numpart, ierr);
  ls_ptr = base.allocateMemory<long>(numpart, ierr);
  ori_ptr = base.allocateMemory<Vector>(5, ierr);

  //transfer data to device
  base.writeData<Vector>(r_ptr, r, numpart);
  base.writeData<Vector>(p_ptr, p, numpart);
  base.writeData<Vector>(x_ptr, x, numpart);
  base.writeData<Vector>(ori_ptr, ori, 5);

  //do a couple of integration loops before the timer is started
  for (int i = 0; i < 5; i++) {
    //calc push
    base.callParallelTTrackerPush (r_ptr, p_ptr, numpart, dt_ptr,
                                   0.05, 1, false, stream1);
    //read R from device
    base.readDataAsync<Vector> (r_ptr, r, numpart, stream1);
    //write LastSection to device
    base.writeDataAsync<long> (ls_ptr, ls, numpart, stream2);
    //calc push transform
    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
                                           dt_ptr, 0.05, 1, false, stream2);
    //read x from device
    base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
    //sync and wait till all tasks and reads are complete
    base.syncDevice();
  }

  checkSum(r, numpart);
  checkSum(x, numpart);

  //start the timing of integration
  struct timeval timeStart, timeEnd;
  std::cout << "start integration" << std::endl;
  gettimeofday(&timeStart, NULL);

  for (int i = 0; i < loop; i++) {
    //calc push
    base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr, 0.05, 1, false, stream1);
    //read R from device
    base.readDataAsync<Vector> (r_ptr, r, numpart, stream1);
    //write LastSection to device
    base.writeDataAsync<long> (ls_ptr, ls, numpart, stream2);
    //calc push transform
    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
                                           dt_ptr, 0.05, 1, false, stream2);
    //read x from device
    base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
    //sync and wait till all tasks and reads are complete
    base.syncDevice();
  }

  gettimeofday(&timeEnd, NULL);
  std::cout << "end integration" << std::endl;

  // elapsed microseconds, accumulated in double so the seconds term cannot
  // overflow an integer type
  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
               (timeEnd.tv_usec - timeStart.tv_usec));

  std::cout << "Time for " << numpart << " integrations: " << t * 1e-6 << "s" << std::endl;
  std::cout << "Average time for integration: " << t * 1e-6 / loop << std::endl;

  checkSum(r, numpart);
  checkSum(x, numpart);

  //free memory
  base.freeMemory<Vector>(r_ptr, numpart);
  base.freeMemory<Vector>(p_ptr, numpart);
  base.freeMemory<Vector>(x_ptr, numpart);
  base.freeMemory<Vector>(ori_ptr, 5);
  base.freeMemory<double>(dt_ptr, numpart);
  base.freeMemory<long>(ls_ptr, numpart);

  //unregister host memory
  base.unregisterHostMemory(r);
  base.unregisterHostMemory(p);
  base.unregisterHostMemory(x);
  base.unregisterHostMemory(dt);
  base.unregisterHostMemory(ls);

  //free host memory
  delete[] r;
  delete[] x;
  delete[] p;
  delete[] dt;
  delete[] ls;
  delete[] ori;

  cout << "==========================END TEST==========================" << endl;
  return 0;
}

76
test/testTranspose.cpp Normal file
View File

@ -0,0 +1,76 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/*
 * Fill d with the complex values (0,0), (1,0), (2,0), ... The number of
 * elements written is N*N for dim == 2, N*N*N for dim == 3 and N otherwise.
 */
void initData(complex<double> *d, int N, int dim) {
  int count;
  switch (dim) {
    case 2:  count = N*N;   break;
    case 3:  count = N*N*N; break;
    default: count = N;     break;
  }

  int idx = 0;
  while (idx < count) {
    d[idx] = complex<double>(idx, 0);
    ++idx;
  }
}
/*
 * Print the real parts of d as tab-separated rows of N values. There are N
 * rows per plane when dim > 1 (otherwise one) and N planes when dim > 2
 * (otherwise one); each plane is followed by a blank line and the whole
 * output by one more.
 */
void printData(complex<double> *d, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;

  for (int plane = 0; plane < planes; ++plane) {
    for (int row = 0; row < rows; ++row) {
      for (int col = 0; col < N; ++col)
        std::cout << d[plane*N*N + row*N + col].real() << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
/*
 * Transpose test: fills an N x N complex array with consecutive values,
 * prints it, transposes it on the OpenCL "-gpu" device and prints the
 * result.
 *
 * Usage: testTranspose [N]   (N defaults to 4)
 */
int main(int argc, char *argv[]) {

  int N = 4;
  if (argc > 1)
    N = atoi(argv[1]);

  int dimN[3] = {N, N, 1};
  int dim = 2;
  int ndim = 1;

  int size = dimN[0] * dimN[1] * dimN[2];

  // host input and output buffers
  std::complex<double> *hd_in = new std::complex<double>[size];
  std::complex<double> *hd_out = new std::complex<double>[size];

  initData(hd_in, N, dim);
  printData(hd_in, N, dim);

  DKSBase base;
  base.setAPI("OpenCL", 6);
  base.setDevice("-gpu", 4);
  base.initDevice();

  int ierr;
  void *mem_ptr = base.allocateMemory< std::complex<double> >(size, ierr);

  // upload, transpose on the device, download, release
  base.writeData< std::complex<double> >(mem_ptr, hd_in, size);
  base.callTranspose(mem_ptr, dimN, dim, ndim);
  base.readData< std::complex<double> >(mem_ptr, hd_out, size);
  base.freeMemory< std::complex<double> >(mem_ptr, size);

  printData(hd_out, N, 2);

  delete[] hd_in;
  delete[] hd_out;

  return 0;
}