snapshot of svn
This commit is contained in:
84
test/CMakeLists.txt
Normal file
84
test/CMakeLists.txt
Normal file
@ -0,0 +1,84 @@
|
||||
# Build rules for the DKS test programs.
# Both the header search path and the library search path point at the
# top-level src directory.
include_directories( ${CMAKE_SOURCE_DIR}/src )

link_directories( ${CMAKE_SOURCE_DIR}/src )

# Test executables. Only testCollimatorPhysics is currently enabled; the
# remaining targets are kept here, commented out, so they can be re-enabled.
#ADD_EXECUTABLE(testDKS testDKS.cpp)
#ADD_EXECUTABLE(testChi testChi.cpp)
#ADD_EXECUTABLE(testFFT testFFT.cpp)
#ADD_EXECUTABLE(testMIC testMIC.cpp)
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp)
#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp)
#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp)
#ADD_EXECUTABLE(testOffset testOffset.cpp)
#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp)
#ADD_EXECUTABLE(testMPI testMPI.cpp)
#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp)
#ADD_EXECUTABLE(testGather testGather.cpp)
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
add_executable(testCollimatorPhysics testCollimatorPhysics.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp)
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)

#shared library
#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp)


# Link lines, in the same order as the executables above.
#TARGET_LINK_LIBRARIES(testDKS dks)
#TARGET_LINK_LIBRARIES(testChi dks)
#TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
#TARGET_LINK_LIBRARIES(testFFT3D dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
#TARGET_LINK_LIBRARIES(testStockFFT3D dks)
#TARGET_LINK_LIBRARIES(testMemObjects dks)
#TARGET_LINK_LIBRARIES(testRCFFT dks)
#TARGET_LINK_LIBRARIES(testOffset dks)
#TARGET_LINK_LIBRARIES(testOffsetMPI dks)
#TARGET_LINK_LIBRARIES(testMPI dks)
#TARGET_LINK_LIBRARIES(testMPIFFT dks)
#TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks)
target_link_libraries(testCollimatorPhysics dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
#TARGET_LINK_LIBRARIES(testPush dks)
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
#TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)


#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared)


# Tests that were only built with the MPI compiler wrapper (disabled).
#IF (${COMPILER_NAME} STREQUAL "mpicxx")
#ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp)
#ADD_EXECUTABLE(testGreens testGreens.cpp)
#ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsMPI testCollimatorPhysicsMPI.cpp)
#TARGET_LINK_LIBRARIES(testGatherAsync2 dks)
#TARGET_LINK_LIBRARIES(testGreens dks)
#TARGET_LINK_LIBRARIES(testFFTSolver dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks)
#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")

#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp)
#TARGET_LINK_LIBRARIES(testChiSquare dks)

# Run-time compiled chi-square test, guarded on CUDA >= 7.0 (disabled).
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
141
test/testChi.cpp
Normal file
141
test/testChi.cpp
Normal file
@ -0,0 +1,141 @@
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "Utility/TimeStamp.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[4];
|
||||
|
||||
|
||||
if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else if (argc == 2){
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests" << endl;
|
||||
|
||||
/* inti data */
|
||||
int ierr;
|
||||
int nsize = 4000000;
|
||||
int jsize = 16;
|
||||
int psize = 6;
|
||||
double *data = new double[nsize*jsize];
|
||||
double *p = new double[psize*jsize];
|
||||
double data_out = 0;
|
||||
|
||||
srand(time(NULL));
|
||||
for (int i = 0; i < nsize*jsize; i++) {
|
||||
//int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
|
||||
//data[i] = sign*(double)rand()/RAND_MAX;
|
||||
data[i] = (double)i / (nsize*jsize);
|
||||
//data[i] = 1;
|
||||
}
|
||||
for (int i = 0; i < psize*jsize; i++) {
|
||||
//int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
|
||||
//p[i] = sign*(double)rand()/RAND_MAX;
|
||||
p[i] = (double)i / (nsize*jsize);
|
||||
//p[i] = 1;
|
||||
}
|
||||
/* end init */
|
||||
|
||||
timestamp_t tstart, tend;
|
||||
//timestamp_t t0, t1;
|
||||
|
||||
tstart = get_timestamp();
|
||||
|
||||
//init dks base class, set API to opencl and init connection with OpenCL device
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
|
||||
//ptrs to hold reference to device memory
|
||||
void *dptr, *ntptr, *pptr;
|
||||
|
||||
//allocate memory on device
|
||||
//t0 = get_timestamp();
|
||||
dptr = base.allocateMemory<double>(nsize*jsize, ierr);
|
||||
ntptr = base.allocateMemory<double>(nsize*jsize, ierr);
|
||||
pptr = base.allocateMemory<double>(psize*jsize, ierr);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Allocate memory: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//write data to device
|
||||
//t0 = get_timestamp();
|
||||
base.writeData<double>(dptr, data, nsize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Write data set: " << get_secs(t0, t1) << endl << endl;
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
//write parameters to device
|
||||
//t0 = get_timestamp();
|
||||
base.writeData<double>(pptr, p, psize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Write parameters: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//set function to calcNt and execute it with necessary parameters
|
||||
//t0 = get_timestamp();
|
||||
base.callNt<double>(ntptr, pptr, psize, nsize, jsize, 0.025);
|
||||
//t1 = get_timestamp();
|
||||
|
||||
//cout << "Calc N(t): " << get_secs(t0, t1) << endl;
|
||||
|
||||
//set function to chi2 and execute it with necessary parameters
|
||||
//t0 = get_timestamp();
|
||||
base.callChi2<double>(ntptr, dptr, ntptr, nsize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Calc chi^2: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//set function so sum and execute it with necessary parameters
|
||||
//t0 = get_timestamp();
|
||||
base.callSum<double>(ntptr, ntptr, nsize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Calc sum: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//read calculated sum (one value)
|
||||
//t0 = get_timestamp();
|
||||
base.readData<double>(ntptr, &data_out, 1);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Read sum: " << get_secs(t0, t1) << endl;
|
||||
cout << "Sum nt: " << data_out << endl;
|
||||
|
||||
/*
|
||||
for (int i = 0; i < psize*jsize; i++) {
|
||||
int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
|
||||
p[i] = sign*(double)rand()/RAND_MAX;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
//cout << endl;
|
||||
}
|
||||
|
||||
//free device memory
|
||||
//t0 = get_timestamp();
|
||||
base.freeMemory<double>(dptr, nsize*jsize);
|
||||
base.freeMemory<double>(ntptr, nsize*jsize);
|
||||
base.freeMemory<double>(pptr, psize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Free memory: " << get_secs(t0, t1) << endl;
|
||||
|
||||
tend = get_timestamp();
|
||||
|
||||
cout << endl << "time: " << get_secs(tstart, tend) << endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
168
test/testChiSquare.cpp
Normal file
168
test/testChiSquare.cpp
Normal file
@ -0,0 +1,168 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Appends the values 0, 1, ..., length-1 to every row of v.
 *
 * Rows are extended, not replaced: existing elements stay in place. The number
 * of rows is whatever v already holds.
 *
 * @param v      rows to extend
 * @param length number of values appended to each row
 */
void initData(std::vector< std::vector<double> > &v, int length) {
  typedef std::vector< std::vector<double> >::iterator RowIter;

  for (RowIter row = v.begin(); row != v.end(); ++row) {
    int value = 0;
    while (value < length) {
      row->push_back(value);
      ++value;
    }
  }
}
|
||||
|
||||
|
||||
/**
 * Prints v to standard output, one row per line, with a tab after every
 * element (including the last one of a row).
 *
 * @param v rows to print; not modified
 */
void printData(std::vector< std::vector<double> > &v) {
  typedef std::vector<double>::size_type Index;

  for (std::vector< std::vector<double> >::size_type r = 0; r != v.size(); ++r) {
    const std::vector<double> &row = v[r];
    for (Index c = 0; c != row.size(); ++c)
      std::cout << row[c] << "\t";
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Fills a flat sensors x length array so that each row holds
 * 0, 1, ..., length-1.
 *
 * @param data    output array, written at indices [0, sensors*length)
 * @param sensors number of rows
 * @param length  number of values per row
 */
void initData(double *data, int sensors, int length) {
  int flat = 0;  // running index, equal to row*length + column
  for (int row = 0; row < sensors; ++row) {
    for (int column = 0; column < length; ++column) {
      data[flat] = column;
      ++flat;
    }
  }
}
|
||||
|
||||
|
||||
/**
 * Prints a flat sensors x length array to standard output, one row per line,
 * with a tab after every value.
 *
 * @param data    array read at indices [0, sensors*length)
 * @param sensors number of rows
 * @param length  number of values per row
 */
void printData(double *data, int sensors, int length) {
  int flat = 0;  // running index, equal to row*length + column
  for (int row = 0; row < sensors; ++row) {
    for (int column = 0; column < length; ++column, ++flat)
      std::cout << data[flat] << "\t";
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Sets par[i] = i / npar for i in [0, npar).
 *
 * @param par  output array with at least npar elements
 * @param npar number of parameters to write
 */
void initPar(double *par, int npar) {
  const double count = npar;  // the division is carried out in double
  int idx = 0;
  while (idx < npar) {
    par[idx] = idx / count;
    ++idx;
  }
}
|
||||
|
||||
/**
 * Prints a divider line: size '=' characters followed by a newline.
 * A non-positive size prints only the newline.
 *
 * @param size number of '=' characters
 */
void printDiv(int size) {
  for (int remaining = size; remaining > 0; --remaining)
    std::cout << "=";
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Host reference chi^2 calculation; prints every per-bin contribution and the
 * total to standard output.
 *
 * For row i and bin j, with time = dt0 + fTimeResolution*fRebin*j, the theory
 * value is
 *   par[2+4i] * exp(-time/tau)
 *     * (1 + par[3+4i] * exp(-0.5*(par[1]*time)^2) * cos(w*time + par[4+4i]*deg2rad))
 *     + par[5+4i]
 * where w = par[0] * 0.08516155035269027. A bin contributes
 * (theo-data)^2/data, or theo^2 when data is exactly zero.
 *
 * Changes from the previous version: the data set is taken by const reference
 * instead of being deep-copied on every call, and each row is iterated over
 * its own length (the old code used the length of row 0 for all rows, reading
 * out of bounds when a later row was shorter).
 *
 * @param fData           one row of bins per sensor
 * @param par             parameters; indices up to 5 + 4*(rows-1) are read
 * @param fTimeResolution time resolution factor
 * @param fRebin          rebinning factor
 */
void calcChisq(const std::vector< std::vector<double> > &fData, double * par, double fTimeResolution, double fRebin)
{
  const double tau = 2.197019;
  const double dt0 = fTimeResolution*0.5*(fRebin-1);
  const double w = par[0]*0.08516155035269027;

  double chisq = 0.0;

  for (std::vector< std::vector<double> >::size_type i = 0; i < fData.size(); i++) {
    const std::vector<double> &row = fData[i];
    for (std::vector<double>::size_type j = 0; j < row.size(); j++) {
      const double data = row[j];
      const double time = dt0+fTimeResolution*fRebin*j;

      const double theo = par[2 + i*4] * exp(-time/tau)*(1.0 + par[3 + i*4]*exp(-0.5 * pow(par[1]*time,2.0))*cos(w*time+par[4+i*4]*1.74532925199432955e-2))+par[5+i*4];

      // an empty bin cannot be used as the denominator
      const double term = (data != 0.0) ? (theo-data)*(theo-data)/data : theo*theo;
      chisq += term;
      std::cout << term << "\t";
    }
    std::cout << std::endl;
  }

  std::cout << "Chisq: " << chisq << std::endl;
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
bool useCuda = true;
|
||||
if (argc == 2 && atoi(argv[1]) == 1)
|
||||
useCuda = false;
|
||||
|
||||
int ierr;
|
||||
int sensors = 5;
|
||||
int length = 10;
|
||||
int npar = 4 * sensors + 2;
|
||||
int ndata = sensors * length;
|
||||
|
||||
double result;
|
||||
|
||||
double fTimeResolution = 0.05;
|
||||
double fRebin = 5;
|
||||
|
||||
double *par = new double[npar];
|
||||
initPar(par, npar);
|
||||
|
||||
vector< vector< double > > fData;
|
||||
fData.resize(sensors);
|
||||
initData(fData, length);
|
||||
printData(fData);
|
||||
printDiv(75);
|
||||
|
||||
DKSBase dksbase;
|
||||
if (useCuda)
|
||||
dksbase.setAPI("Cuda", 4);
|
||||
else
|
||||
dksbase.setAPI("OpenCL", 6);
|
||||
dksbase.setDevice("-gpu", 4);
|
||||
dksbase.initDevice();
|
||||
dksbase.setupFFT(0, NULL);
|
||||
|
||||
|
||||
void *mem_data, *mem_par, *mem_chisq;
|
||||
cout << "Allocate memory" << endl;
|
||||
mem_par = dksbase.allocateMemory<double>(npar, ierr);
|
||||
mem_data = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
|
||||
mem_chisq = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
|
||||
|
||||
|
||||
cout << "Write data" << endl;
|
||||
dksbase.writeData<double>(mem_par, par, npar);
|
||||
for (int i = 0; i < sensors; i++)
|
||||
dksbase.writeData<double>(mem_data, &fData[i][0], length, i*length);
|
||||
|
||||
|
||||
|
||||
cout << "Call PHistoTFFcn" << endl;
|
||||
dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||
fTimeResolution, fRebin,
|
||||
sensors, length, npar, result);
|
||||
cout << "Result: " << result << endl;
|
||||
|
||||
|
||||
double *out_data = new double[ndata];
|
||||
dksbase.readData<double>(mem_chisq, out_data, ndata);
|
||||
printDiv(75);
|
||||
printData(out_data, sensors, length);
|
||||
printDiv(75);
|
||||
|
||||
calcChisq(fData, par, fTimeResolution, fRebin);
|
||||
printDiv(75);
|
||||
|
||||
cout << "Free memory" << endl;
|
||||
dksbase.freeMemory<double>(mem_par, npar);
|
||||
dksbase.freeMemory<double>(mem_data, ndata);
|
||||
dksbase.freeMemory<double>(mem_chisq, ndata);
|
||||
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
193
test/testChiSquareRT.cpp
Normal file
193
test/testChiSquareRT.cpp
Normal file
@ -0,0 +1,193 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <omp.h>
|
||||
|
||||
#include "DKSBaseMuSR.h"
|
||||
#include "Utility/DKSTimer.h"
|
||||
|
||||
/**
 * Fills data[0..N) either with 1.0 or with pseudo-random values in [0, 1].
 *
 * @param data output array with at least N elements
 * @param N    number of elements to write
 * @param ones when true every element is set to 1.0; otherwise each element
 *             is rand() / RAND_MAX (one rand() call per element, in order)
 */
void initData(double *data, int N, bool ones = false) {
  if (ones) {
    for (int k = 0; k < N; ++k)
      data[k] = 1.0;
    return;
  }

  for (int k = 0; k < N; ++k)
    data[k] = (double)rand() / RAND_MAX;
}
|
||||
|
||||
/**
 * Prints the first N elements of data on one line of standard output, with a
 * tab after every element, then a newline.
 *
 * @tparam T   element type; must be streamable to std::cout
 * @param data array with at least N elements
 * @param N    number of elements to print
 */
template <typename T>
void printData(T *data, int N) {
  int k = 0;
  while (k < N) {
    std::cout << data[k] << "\t";
    ++k;
  }
  std::cout << std::endl;
}
|
||||
|
||||
|
||||
// Theory function source text handed to callCompileProgram. The host
// function fTheory evaluates the same expression, so the two must be kept in
// step.
const std::string funct("cos(t*p[0]) - exp(-t*p[m[0]])");
// Alternative expressions kept for reference:
//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])";
//const std::string funct = "p[m[0]] * se(t, p[m[1]])";
//const std::string funct = "p[m[1]] + p[m[0]]";
|
||||
|
||||
/**
 * Host evaluation of the theory expression
 * cos(time*par[0]) - exp(-time*par[map[0]]).
 *
 * @param time time value
 * @param par  parameter array; par[0] and par[map[0]] are read
 * @param func not used by this expression
 * @param map  index map; only map[0] is read
 * @return value of the expression at the given time
 */
double fTheory(double time, double *par, double *func, int *map) {
  (void)func;  // unused by the current expression

  const double oscillation = std::cos(time * par[0]);
  const double decay = std::exp(-time * par[map[0]]);
  return oscillation - decay;
}
|
||||
|
||||
double testFunctionSerial(double *data, double *par, double *func, int *map,
|
||||
double N0, double tau, double bkg, double timeStep,
|
||||
int startTimeBin, int endTimeBin)
|
||||
{
|
||||
double time, diff, theo;
|
||||
double chisq = 0;
|
||||
for (int i = startTimeBin; i < endTimeBin; ++i) {
|
||||
time = i * timeStep;
|
||||
theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
|
||||
diff = data[i] - theo;
|
||||
|
||||
chisq += diff * diff / data[i];
|
||||
}
|
||||
|
||||
return chisq;
|
||||
}
|
||||
|
||||
double testFunctionParallel(double *data, double *par, double *func, int *map,
|
||||
double N0, double tau, double bkg, double timeStep,
|
||||
int startTimeBin, int endTimeBin)
|
||||
{
|
||||
int i, chunk;
|
||||
double time, diff, theo;
|
||||
double chisq = 0;
|
||||
|
||||
chunk = (endTimeBin - startTimeBin) / omp_get_num_procs();
|
||||
if (chunk < 10)
|
||||
chunk = 10;
|
||||
#pragma omp parallel for default(shared) private (i,time,diff) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq)
|
||||
for (i = startTimeBin; i < endTimeBin; ++i) {
|
||||
time = i * timeStep;
|
||||
theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
|
||||
diff = data[i] - theo;
|
||||
|
||||
chisq += diff * diff / data[i];
|
||||
}
|
||||
|
||||
return chisq;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int Loop = 100;
|
||||
|
||||
//init test data on the host
|
||||
int Ndata = 8;
|
||||
if (argc > 1)
|
||||
Ndata = atoi(argv[1]);
|
||||
|
||||
int api = 1;
|
||||
if (argc > 2)
|
||||
api = atoi(argv[2]);
|
||||
|
||||
int Npar = 66;
|
||||
int Nfunc = 1;
|
||||
int Nmap = 4;
|
||||
|
||||
double *data = new double[Ndata];
|
||||
double *par = new double[Npar];
|
||||
double *func = new double[Nfunc];
|
||||
int *map = new int[Nmap];
|
||||
|
||||
initData(data, Ndata);
|
||||
initData(par, Npar);
|
||||
initData(func, Nfunc);
|
||||
map[0] = 1;
|
||||
map[1] = 2;
|
||||
map[2] = 3;
|
||||
map[3] = 4;
|
||||
|
||||
//create timers
|
||||
DKSTimer serialTimer;
|
||||
DKSTimer cudaTimer;
|
||||
DKSTimer ompTimer;
|
||||
DKSTimer gpuOverhead;
|
||||
serialTimer.init("Serial timer");
|
||||
cudaTimer.init("Cuda timer");
|
||||
ompTimer.init("OpenMP timer");
|
||||
gpuOverhead.init("Overhead for gpu");
|
||||
|
||||
|
||||
//serial version
|
||||
double resultSerial;
|
||||
|
||||
serialTimer.start();
|
||||
for (int i = 0; i < Loop; i++)
|
||||
resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
|
||||
serialTimer.stop();
|
||||
|
||||
//openmp version
|
||||
double resultOMP = 0.0;
|
||||
|
||||
ompTimer.start();
|
||||
//for (int i = 0; i < Loop; i++)
|
||||
// resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
|
||||
ompTimer.stop();
|
||||
|
||||
|
||||
//create and init dkabase
|
||||
gpuOverhead.start();
|
||||
|
||||
DKSBaseMuSR dksbase;
|
||||
if (api == 1)
|
||||
dksbase.setAPI("Cuda");
|
||||
else
|
||||
dksbase.setAPI("OpenCL");
|
||||
|
||||
dksbase.setDevice("-gpu");
|
||||
dksbase.initDevice();
|
||||
dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap);
|
||||
|
||||
//allocate memory on the device
|
||||
int ierr;
|
||||
void *data_ptr;
|
||||
|
||||
data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
|
||||
|
||||
dksbase.writeData<double>(data_ptr, data, Ndata);
|
||||
dksbase.writeFunctions(func, Nfunc);
|
||||
dksbase.writeMaps(map, Nmap);
|
||||
|
||||
dksbase.callCompileProgram(funct);
|
||||
gpuOverhead.stop();
|
||||
|
||||
double resultCuda;
|
||||
|
||||
cudaTimer.start();
|
||||
for (int i = 0; i < Loop; i++) {
|
||||
dksbase.writeParams(par, Npar);
|
||||
int ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap,
|
||||
0.0, 0.1, 0, resultCuda);
|
||||
|
||||
if (ierr != 0)
|
||||
exit (EXIT_FAILURE);
|
||||
|
||||
}
|
||||
cudaTimer.stop();
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "=======================Results=======================" << std::endl;
|
||||
std::cout << "Result serial = " << resultSerial << std::endl;
|
||||
std::cout << "Result prallel = " << resultOMP << std::endl;
|
||||
std::cout << "Result cuda = " << resultCuda << std::endl;
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "=======================Timings=======================" << std::endl;
|
||||
serialTimer.print();
|
||||
ompTimer.print();
|
||||
cudaTimer.print();
|
||||
gpuOverhead.print();
|
||||
std::cout << std::endl;
|
||||
|
||||
dksbase.freeMemory<double>(data_ptr, Ndata);
|
||||
|
||||
return 0;
|
||||
|
||||
|
||||
}
|
248
test/testCollimatorPhysics.cpp
Normal file
248
test/testCollimatorPhysics.cpp
Normal file
@ -0,0 +1,248 @@
|
||||
#include <iostream>
|
||||
|
||||
#include <vector>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include <vector_types.h>
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Particle record used by the collimator physics test: a label, a local id and
// three-component Rincol / Pincol arrays.
struct PART_SMALL {
  int label;
  unsigned localID;
  double Rincol[3];
  double Pincol[3];
};
|
||||
|
||||
// Plain three-component vector of doubles.
struct Vector {
  double x;
  double y;
  double z;
};
|
||||
|
||||
/**
 * Builds the start state of one test particle.
 *
 * @param d value stored as the particle's localID
 * @return particle with label 0, Rincol = (0, 0, 0.02) and
 *         Pincol = (0, 0, 3.9920183237269791e-01)
 */
PART_SMALL initPartSmall(int d) {
  PART_SMALL part;

  part.label = 0;
  part.localID = d;

  // the two leading components of both arrays start at zero
  for (int axis = 0; axis < 2; ++axis) {
    part.Rincol[axis] = 0.0;
    part.Pincol[axis] = 0.0;
  }
  part.Rincol[2] = 0.02;
  part.Pincol[2] = 3.9920183237269791e-01;

  return part;
}
|
||||
|
||||
/**
 * @return a Vector with all three components set to 0.5
 */
Vector initVector() {
  Vector result;
  result.x = result.y = result.z = 0.5;
  return result;
}
|
||||
|
||||
/**
 * Prints one particle on a single line of standard output: label, localID and
 * the three components of Rincol and Pincol.
 *
 * @param p particle to print (taken by value)
 */
void printPart(PART_SMALL p) {
  std::cout << "label: " << p.label << ", "
            << "localid: " << p.localID << ","
            << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
            << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2]
            << std::endl;
}
|
||||
|
||||
/**
 * Prints the x, y and z components of v, tab separated, on one line.
 *
 * @param v vector to print (taken by value)
 */
void printVector(Vector v) {
  std::cout << v.x << "\t";
  std::cout << v.y << "\t";
  std::cout << v.z << std::endl;
}
|
||||
|
||||
/**
 * Initialises p[0..N) with initPartSmall, using the array index as the
 * argument.
 *
 * @param p output array with at least N elements
 * @param N number of particles to initialise
 */
void initParts(PART_SMALL *p, int N) {
  int idx = 0;
  while (idx < N) {
    p[idx] = initPartSmall(idx);
    ++idx;
  }
}
|
||||
|
||||
/**
 * Prints p[0..N) with printPart, one particle per line, followed by an empty
 * line.
 *
 * @param p array with at least N elements
 * @param N number of particles to print
 */
void printParts(PART_SMALL *p, int N) {
  int idx = 0;
  while (idx < N) {
    printPart(p[idx]);
    ++idx;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Sets every element of v[0..N) to the value returned by initVector.
 *
 * @param v output array with at least N elements
 * @param N number of vectors to initialise
 */
void initVectors(Vector *v, int N) {
  int idx = 0;
  while (idx < N) {
    v[idx] = initVector();
    ++idx;
  }
}
|
||||
|
||||
/**
 * Prints v[0..N) with printVector, one vector per line, followed by an empty
 * line.
 *
 * @param v array with at least N elements
 * @param N number of vectors to print
 */
void printVectors(Vector *v, int N) {
  int idx = 0;
  while (idx < N) {
    printVector(v[idx]);
    ++idx;
  }
  std::cout << std::endl;
}
|
||||
|
||||
|
||||
/**
 * Writes the twelve fixed parameter values used by the collimator physics
 * test into data[0..11].
 *
 * @param data output array with at least 12 elements
 */
void initParams(double *data) {
  // Entries 0 and 1 previously carried the alternative values
  // 2.0000000000000000e-02 and 1.0000000000000000e-02 (kept as comments in
  // the earlier code).
  static const double kValues[12] = {
    0.0,
    1.0,
    2.2100000000000000e+00,
    6.0000000000000000e+00,
    1.2010700000000000e+01,
    2.6010000000000000e+00,
    1.7010000000000000e+03,
    1.2790000000000000e+03,
    1.6379999999999999e-02,
    1.9321266968325795e-01,
    7.9000000000000000e+01,
    1.0000000000000002e-12
  };

  for (int k = 0; k < 12; ++k)
    data[k] = kValues[k];
}
|
||||
|
||||
/**
 * Prints data[0..N) on one line of standard output, with a tab after every
 * value, then a newline.
 *
 * @param data array with at least N elements
 * @param N    number of values to print
 */
void printDouble(double *data, int N) {
  int k = 0;
  while (k < N) {
    std::cout << data[k] << "\t";
    ++k;
  }
  std::cout << std::endl;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int loop = 10;
|
||||
int numpart = 1e5;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenMP");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-npart")) {
|
||||
numpart = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (argv[i] == string("-loop")) {
|
||||
loop = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Number of particles: " << numpart << endl;
|
||||
cout << "Number of loops: " << loop << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
|
||||
//init part vector to test mc
|
||||
PART_SMALL *parts = new PART_SMALL[numpart];
|
||||
initParts(parts, numpart);
|
||||
|
||||
double *params = new double[12];
|
||||
initParams(params);
|
||||
|
||||
//init dks
|
||||
int ierr;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
//init random
|
||||
base.callInitRandoms(numpart);
|
||||
|
||||
//**test collimator physics and sort***//
|
||||
void *part_ptr, *param_ptr;
|
||||
|
||||
//allocate memory for particles
|
||||
part_ptr = base.allocateMemory<PART_SMALL>(numpart, ierr);
|
||||
param_ptr = base.allocateMemory<double>(12, ierr);
|
||||
|
||||
//transfer data to device
|
||||
base.writeData<PART_SMALL>(part_ptr, parts, numpart);
|
||||
base.writeData<double>(param_ptr, params, 12);
|
||||
|
||||
int numaddback;
|
||||
//test calls to do some first executions
|
||||
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
//std::cout << "particles to add back: " << numaddback << std::endl;
|
||||
|
||||
struct timeval timeStart, timeEnd;
|
||||
std::cout << "Start MC" << std::endl;
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i = 0; i < loop; i++) {
|
||||
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
std::cout << "addback: " << numaddback << std::endl;
|
||||
|
||||
std::cout << "End MC" << std::endl;
|
||||
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec));
|
||||
|
||||
std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
|
||||
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
|
||||
|
||||
//read data from device
|
||||
base.readData<PART_SMALL>(part_ptr, parts, numpart);
|
||||
|
||||
//free memory
|
||||
base.freeMemory<PART_SMALL>(part_ptr, numpart);
|
||||
base.freeMemory<double>(param_ptr, 12);
|
||||
|
||||
|
||||
std::cout << std::fixed << std::setprecision(4);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
std::cout << parts[i].label << "\t"
|
||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
std:: cout << "..." << std::endl;
|
||||
|
||||
for (int i = numpart - 10; i < numpart; i++) {
|
||||
std::cout << parts[i].label << "\t"
|
||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
double arx = 0, ary = 0, arz = 0;
|
||||
double apx = 0, apy = 0, apz = 0;
|
||||
for (int i = 0; i < numpart; i++) {
|
||||
|
||||
arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart;
|
||||
ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart;
|
||||
arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart;
|
||||
|
||||
apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart;
|
||||
apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart;
|
||||
apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart;
|
||||
|
||||
}
|
||||
|
||||
std::cout << std::fixed << std::setprecision(10);
|
||||
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
|
||||
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
|
||||
|
||||
|
||||
cout << "==========================END TEST==========================" << endl;
|
||||
return 0;
|
||||
|
||||
}
|
126
test/testCollimatorPhysicsMPI.cpp
Normal file
126
test/testCollimatorPhysicsMPI.cpp
Normal file
@ -0,0 +1,126 @@
|
||||
#include <iostream>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Full particle record used by the MPI collimator physics test. Member order
// and types are unchanged, so the memory layout handed to the device is the
// same.
struct PART {
  int label;
  unsigned localID;
  double Rincol[3];
  double Pincol[3];
  long IDincol;
  int Binincol;
  double DTincol;
  double Qincol;
  long LastSecincol;
  double Bfincol[3];
  double Efincol[3];
};
|
||||
|
||||
/**
 * Builds a test particle whose fields are derived from d.
 *
 * label, localID, IDincol, Binincol, DTincol, Qincol and LastSecincol are all
 * set to d; every Rincol and Pincol component is 0.5; every Bfincol and
 * Efincol component is 1.0 / (d + 1).
 *
 * @param d seed value for the particle
 * @return the initialised particle
 */
PART initPart(int d) {
  PART part;

  // scalar fields all carry the seed value
  part.label = d;
  part.localID = d;
  part.IDincol = d;
  part.Binincol = d;
  part.DTincol = d;
  part.Qincol = d;
  part.LastSecincol = d;

  // vector fields
  const double fieldValue = 1.0 / (d + 1);
  for (int axis = 0; axis < 3; ++axis) {
    part.Rincol[axis] = 0.5;
    part.Pincol[axis] = 0.5;
    part.Bfincol[axis] = fieldValue;
    part.Efincol[axis] = fieldValue;
  }

  return part;
}
|
||||
|
||||
/**
 * Prints the label and the Rincol / Pincol components of a particle on one
 * line of standard output. The remaining fields of PART are not printed.
 *
 * @param p particle to print (taken by value)
 */
void printPart(PART p) {
  std::cout << "label: " << p.label << ", "
            << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
            << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << ", "
            << std::endl;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
int numpart = 500501;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
base.callInitRandoms(numpart);
|
||||
|
||||
PART tmp;
|
||||
vector<PART> p;
|
||||
vector<PART> p_out;
|
||||
p_out.resize(numpart);
|
||||
|
||||
for (int i = 0; i < numpart; i++) {
|
||||
tmp = initPart(i + 1);
|
||||
p.push_back(tmp);
|
||||
}
|
||||
|
||||
if (numpart <= 20) {
|
||||
for (int i = 0; i < 10; i++)
|
||||
printPart(p[i]);
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
double params[19];
|
||||
for (int i = 0; i < 19; i++)
|
||||
params[i] = 0.05;
|
||||
params[0] = 0;
|
||||
params[1] = 1;
|
||||
|
||||
void *mem_ptr, *par_ptr;
|
||||
|
||||
par_ptr = base.allocateMemory<double>(19, ierr);
|
||||
base.writeData<double>(par_ptr, params, 19);
|
||||
|
||||
mem_ptr = base.allocateMemory<PART>(numpart, ierr);
|
||||
base.writeData<PART>(mem_ptr, &p[0], numpart);
|
||||
|
||||
int addback, dead;
|
||||
for (int i = 0; i < 100; i++)
|
||||
base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead);
|
||||
cout << "Add back: " << addback << ", dead: " << dead << endl;
|
||||
|
||||
base.readData<PART>(mem_ptr, &p_out[0], numpart);
|
||||
base.freeMemory<PART>(mem_ptr, ierr);
|
||||
base.freeMemory<double>(par_ptr, ierr);
|
||||
|
||||
if (numpart <= 20) {
|
||||
for (int i = 0; i < numpart; i++)
|
||||
printPart(p_out[i]);
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
|
||||
}
|
250
test/testCollimatorPhysicsSoA.cpp
Normal file
250
test/testCollimatorPhysicsSoA.cpp
Normal file
@ -0,0 +1,250 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
#include <vector>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include <vector_types.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Structure-of-arrays particle container: one pointer per particle attribute.
// The struct itself does not allocate or free what the pointers refer to.
struct PART {
  int *label;
  unsigned *localID;
  double *rx;
  double *ry;
  double *rz;
  double *px;
  double *py;
  double *pz;
};
|
||||
|
||||
|
||||
/**
 * Initialises the start state of npart particles stored as separate arrays.
 *
 * For every index i in [0, npart): label[i] = 0, localID[i] = i,
 * (rx, ry, rz)[i] = (0, 0, 0.02) and
 * (px, py, pz)[i] = (0, 0, 3.9920183237269791e-01).
 *
 * @param label   output labels
 * @param localID output local ids
 * @param rx,ry,rz output position components
 * @param px,py,pz output momentum components
 * @param npart   number of particles; every array needs at least this many
 *                elements
 */
void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz,
               double *px, double *py, double *pz, int npart) {
  const double startZ = 0.02;
  const double startPz = 3.9920183237269791e-01;

  int idx = 0;
  while (idx < npart) {
    label[idx] = 0;
    localID[idx] = idx;

    rx[idx] = 0.0;
    ry[idx] = 0.0;
    rz[idx] = startZ;

    px[idx] = 0.0;
    py[idx] = 0.0;
    pz[idx] = startPz;

    ++idx;
  }
}
|
||||
|
||||
/**
 * Writes the twelve fixed parameter values used by the collimator physics
 * test into data[0..11].
 *
 * @param data output array with at least 12 elements
 */
void initParams(double *data) {
  // Entries 0 and 1 previously carried the alternative values
  // 2.0000000000000000e-02 and 1.0000000000000000e-02 (kept as comments in
  // the earlier code).
  static const double kValues[12] = {
    0.0,
    1.0,
    2.2100000000000000e+00,
    6.0000000000000000e+00,
    1.2010700000000000e+01,
    2.6010000000000000e+00,
    1.7010000000000000e+03,
    1.2790000000000000e+03,
    1.6379999999999999e-02,
    1.9321266968325795e-01,
    7.9000000000000000e+01,
    1.0000000000000002e-12
  };

  for (int k = 0; k < 12; ++k)
    data[k] = kValues[k];
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int loop = 10;
|
||||
int numpart = 1e5;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenMP");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-npart")) {
|
||||
numpart = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (argv[i] == string("-loop")) {
|
||||
loop = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int threads = 0;
|
||||
/*
|
||||
#pragma offload target(mic:0) out(threads)
|
||||
{
|
||||
#pragma omp parallel
|
||||
{
|
||||
threads = omp_get_num_threads();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Number of particles: " << numpart << endl;
|
||||
cout << "Number of loops: " << loop << endl;
|
||||
cout << "Number of threads: " << threads << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
|
||||
//init part vector to test mc
|
||||
//int *label;
|
||||
//unsigned *localID;
|
||||
//double *rx, *ry, *rz, *px, *py, *pz;
|
||||
PART p;
|
||||
p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64);
|
||||
p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64);
|
||||
p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart);
|
||||
|
||||
double *params = new double[12];
|
||||
initParams(params);
|
||||
|
||||
//init dks
|
||||
int ierr;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
//init random
|
||||
base.callInitRandoms(numpart);
|
||||
|
||||
//**test collimator physics and sort***//
|
||||
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
|
||||
|
||||
//allocate memory for particles
|
||||
label_ptr = base.allocateMemory<int>(numpart, ierr);
|
||||
localID_ptr = base.allocateMemory<unsigned>(numpart, ierr);
|
||||
rx_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
ry_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
rz_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
px_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
py_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
pz_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
|
||||
param_ptr = base.allocateMemory<double>(12, ierr);
|
||||
|
||||
//transfer data to device
|
||||
base.writeData<int>(label_ptr, p.label, numpart);
|
||||
base.writeData<unsigned>(localID_ptr, p.localID, numpart);
|
||||
base.writeData<double>(rx_ptr, p.rx, numpart);
|
||||
base.writeData<double>(ry_ptr, p.ry, numpart);
|
||||
base.writeData<double>(rz_ptr, p.rz, numpart);
|
||||
base.writeData<double>(px_ptr, p.px, numpart);
|
||||
base.writeData<double>(py_ptr, p.py, numpart);
|
||||
base.writeData<double>(pz_ptr, p.pz, numpart);
|
||||
|
||||
//transfer params to device
|
||||
base.writeData<double>(param_ptr, params, 12);
|
||||
|
||||
std::cout << "test runs" << std::endl;
|
||||
|
||||
int numaddback;
|
||||
//test calls to do some first executions
|
||||
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
|
||||
struct timeval timeStart, timeEnd;
|
||||
std::cout << "Start MC" << std::endl;
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i = 0; i < loop; i++) {
|
||||
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
std::cout << "addback: " << numaddback << std::endl;
|
||||
|
||||
std::cout << "End MC" << std::endl;
|
||||
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec));
|
||||
|
||||
std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl;
|
||||
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
|
||||
|
||||
//read data from device
|
||||
base.readData<int>(label_ptr, p.label, numpart);
|
||||
base.readData<unsigned>(localID_ptr, p.localID, numpart);
|
||||
base.readData<double>(rx_ptr, p.rx, numpart);
|
||||
base.readData<double>(ry_ptr, p.ry, numpart);
|
||||
base.readData<double>(rz_ptr, p.rz, numpart);
|
||||
base.readData<double>(px_ptr, p.px, numpart);
|
||||
base.readData<double>(py_ptr, p.py, numpart);
|
||||
base.readData<double>(pz_ptr, p.pz, numpart);
|
||||
|
||||
//free memory
|
||||
base.freeMemory<int>(label_ptr, numpart);
|
||||
base.freeMemory<unsigned>(localID_ptr, numpart);
|
||||
base.freeMemory<double>(rx_ptr, numpart);
|
||||
base.freeMemory<double>(ry_ptr, numpart);
|
||||
base.freeMemory<double>(rz_ptr, numpart);
|
||||
base.freeMemory<double>(px_ptr, numpart);
|
||||
base.freeMemory<double>(py_ptr, numpart);
|
||||
base.freeMemory<double>(pz_ptr, numpart);
|
||||
|
||||
base.freeMemory<double>(param_ptr, 12);
|
||||
|
||||
/*
|
||||
std::cout << std::fixed << std::setprecision(4);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
std::cout << p.label[i] << "\t" << p.rx[i]
|
||||
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
|
||||
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
|
||||
}
|
||||
std:: cout << "..." << std::endl;
|
||||
|
||||
for (int i = numpart - 10; i < numpart; i++) {
|
||||
std::cout << p.label[i] << "\t" << p.rx[i]
|
||||
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
|
||||
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
|
||||
}
|
||||
|
||||
double arx = 0, ary = 0, arz = 0;
|
||||
double apx = 0, apy = 0, apz = 0;
|
||||
for (int i = 0; i < numpart; i++) {
|
||||
|
||||
arx += sqrt(p.rx[i] * p.rx[i]) / numpart;
|
||||
ary += sqrt(p.ry[i] * p.ry[i]) / numpart;
|
||||
arz += sqrt(p.rz[i] * p.rz[i]) / numpart;
|
||||
|
||||
apx += sqrt(p.px[i] * p.px[i]) / numpart;
|
||||
apy += sqrt(p.py[i] * p.py[i]) / numpart;
|
||||
apz += sqrt(p.pz[i] * p.pz[i]) / numpart;
|
||||
|
||||
}
|
||||
|
||||
std::cout << std::fixed << std::setprecision(10);
|
||||
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
|
||||
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
|
||||
*/
|
||||
cout << "==========================END TEST==========================" << endl;
|
||||
return 0;
|
||||
|
||||
}
|
15
test/testDKS.cpp
Normal file
15
test/testDKS.cpp
Normal file
@ -0,0 +1,15 @@
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Smallest DKS smoke test: construct a DKSBase and call getDevices() on it.
 * Command line arguments are ignored.
 */
int main(int argc, char *argv[]) {

  (void)argc;  // unused
  (void)argv;  // unused

  // direct initialisation; no temporary-and-copy as in `DKSBase base = DKSBase();`
  DKSBase base;
  base.getDevices();

  return 0;
}
|
||||
|
83
test/testFFT.cpp
Normal file
83
test/testFFT.cpp
Normal file
@ -0,0 +1,83 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests" << endl;
|
||||
|
||||
int N = 2;
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
complex<double> *cdata = new complex<double>[N];
|
||||
complex<double> *cfft = new complex<double>[N];
|
||||
for (int i = 0; i < N; i++) {
|
||||
cdata[i] = complex<double>(0, 0);
|
||||
cfft[i] = complex<double>(0, 0);
|
||||
}
|
||||
|
||||
cdata[0] = complex<double>(1.73205, 1.73205);
|
||||
|
||||
timestamp_t t0, t1;
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* write data to device */
|
||||
mem_ptr = base.pushData< complex<double> >( (const void*)cdata, N, ierr);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 1, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 1, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 1, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.pullData< complex<double> >(mem_ptr, cfft, N);
|
||||
|
||||
/* print results */
|
||||
|
||||
cout << "Data" << endl;
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << cdata[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
cout << "FFT" << endl;
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << cfft[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
159
test/testFFT3D.cpp
Normal file
159
test/testFFT3D.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
/* usage - ./testFFT3D */
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 16;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 4) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, argv[3]);
|
||||
} else {
|
||||
N = 16;
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
cout << "Begin DKS Base tests, N = " << N << endl;
|
||||
|
||||
int dim = 3;
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cifft = new complex<double>[N*N*N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cifft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
base.setupFFT(3, dimsize);
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* allocate memory on device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
|
||||
/* compare results */
|
||||
compareData(cdata, cifft, N, dim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Print a complex array to stdout, one line per row of the fastest index
 * and a blank line after each outermost slice.
 *
 * @param data      array read at data[i*ni*ni + j*nj + k], where ni is N for
 *                  dim > 2 (else 1) and nj is N for dim > 1 (else 1)
 * @param N         number of elements along each used dimension
 * @param dim       number of dimensions in use
 * @param normalize false: print "re im" per element; true: print only re / N
 */
void printData(std::complex<double>* &data, int N, int dim, bool normalize) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;

  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        const std::complex<double> &v = data[i*ni*ni + j*nj + k];
        if (!normalize)
          std::cout << v.real() << " " << v.imag() << "\t";
        else
          std::cout << v.real() / N << "\t";
      }
      std::cout << '\n';  // no per-row flush needed
    }
    std::cout << '\n';
  }
}
|
||||
|
||||
/**
 * Print an N*N*N complex array as "re; im" pairs, one output line per j
 * (all i and k on that line). Components with magnitude below 10e-5 are
 * printed as 0.
 *
 * @param data  array read at data[i*N*N + j*N + k]
 * @param N     number of elements along each dimension
 * @param dim   unused
 */
void printData3DN4(std::complex<double>* &data, int N, int dim) {

  (void)dim;  // kept for interface compatibility

  for (int j = 0; j < N; j++) {
    for (int i = 0; i < N; i++) {
      for (int k = 0; k < N; k++) {
        const std::complex<double> &v = data[i*N*N + j*N + k];
        double d = v.real();
        double a = v.imag();

        // suppress numerical noise around zero
        if (d < 10e-5 && d > -10e-5)
          d = 0;
        if (a < 10e-5 && a > -10e-5)
          a = 0;

        std::cout << d << "; " << a << "\t";
      }
    }
    std::cout << '\n';
  }
  std::cout << '\n';

}
|
||||
|
||||
/**
 * Sum the absolute differences of the real and imaginary parts of two
 * complex arrays and print the total to stdout.
 *
 * @param data1,data2  arrays read at [i*ni*ni + j*nj + k], where ni is N for
 *                     dim > 2 (else 1) and nj is N for dim > 1 (else 1)
 * @param N            number of elements along each used dimension
 * @param dim          number of dimensions in use
 */
void compareData(std::complex<double>* &data1, std::complex<double>* &data2, int N, int dim) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;
  double sum = 0;
  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        const int id = i*ni*ni + j*nj + k;
        sum += std::fabs(data1[id].real() - data2[id].real());
        sum += std::fabs(data1[id].imag() - data2[id].imag());
      }
    }
  }
  std::cout << "Size " << N << " CC <--> CC diff: " << sum << std::endl;
}
|
||||
|
199
test/testFFT3DRC.cpp
Normal file
199
test/testFFT3DRC.cpp
Normal file
@ -0,0 +1,199 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
||||
void initData(double *data, int dimsize[3]);
|
||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
|
||||
void printHelp();
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N1 = 8;
|
||||
int N2 = 8;
|
||||
int N3 = 8;
|
||||
int dim = 3;
|
||||
int loop = 10;
|
||||
|
||||
if ( readParams(argc, argv, N1, N2, N3, loop) )
|
||||
return 0;
|
||||
|
||||
int dimsize[3] = {N3, N2, N1};
|
||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
||||
|
||||
double *rdata = new double[sizereal];
|
||||
double *outdata = new double[sizereal];
|
||||
complex<double> *cfft = new complex<double>[sizecomp];
|
||||
|
||||
for (int i=0; i<sizecomp; ++i) {
|
||||
cfft[i].real() = 7.;
|
||||
cfft[i].imag() = 3.33;
|
||||
}
|
||||
initData(rdata, dimsize);
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
#ifdef DKS_MIC
|
||||
DKSBase base;
|
||||
base.setAPI("OpenMP", 6);
|
||||
base.setDevice("-mic", 4);
|
||||
base.initDevice();
|
||||
base.setupFFTRC(dim, dimsize);
|
||||
/* setup backward fft (COMPLEX->REAL) */
|
||||
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
||||
#endif
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
base.setupFFT(dim, dimsize);
|
||||
#endif
|
||||
|
||||
// allocate memory on device
|
||||
int ierr;
|
||||
void *real_ptr, *comp_ptr, *real_res_ptr;
|
||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
real_res_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
comp_ptr = base.allocateMemory< std::complex<double> >(sizecomp, ierr);
|
||||
|
||||
// execute one run before starting the timers
|
||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||
|
||||
//timer for total loop time, FFT and IFFT calls
|
||||
struct timeval timeStart, timeEnd;
|
||||
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
|
||||
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i=0; i<loop; ++i){
|
||||
|
||||
// write data to device
|
||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||
|
||||
// execute rcfft
|
||||
gettimeofday(&timeFFTStart[i], NULL);
|
||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
gettimeofday(&timeFFTEnd[i], NULL);
|
||||
|
||||
// execute crfft
|
||||
gettimeofday(&timeIFFTStart[i], NULL);
|
||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||
gettimeofday(&timeIFFTEnd[i], NULL);
|
||||
|
||||
//normalize
|
||||
#ifdef DKS_CUDA
|
||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||
#endif
|
||||
|
||||
// read IFFT data from device
|
||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
// free device memory
|
||||
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
||||
base.freeMemory<double>(real_ptr, sizereal);
|
||||
base.freeMemory<double>(real_res_ptr, sizereal);
|
||||
|
||||
// compare in and out data to see if we get back the same results
|
||||
compareData(rdata, outdata, N1, N2, N3, dim);
|
||||
|
||||
//calculate seconds for total time and fft times
|
||||
double tfft = 0;
|
||||
double tifft = 0;
|
||||
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
|
||||
|
||||
for (int i = 0; i < loop; i++) {
|
||||
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
|
||||
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
|
||||
|
||||
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
|
||||
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
|
||||
}
|
||||
|
||||
//print timing results
|
||||
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
|
||||
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
|
||||
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
|
||||
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
|
||||
<< "\n\n";
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Sum the absolute element-wise differences of two real arrays holding
 * NI*NJ*NK values each and print the total to stdout.
 *
 * @param data1,data2  arrays read at [k*NI*NJ + j*NI + i]
 * @param NI,NJ,NK     extents of the three loop indices i, j, k
 * @param dim          unused
 */
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
  (void)dim;  // kept for interface compatibility
  double sum = 0;
  for (int i = 0; i < NI; i++) {
    for (int j = 0; j < NJ; j++) {
      for (int k = 0; k < NK; k++) {
        const int id = k*NI*NJ + j*NI + i;
        sum += std::fabs(data1[id] - data2[id]);
      }
    }
  }
  std::cout << "RC <--> CR diff: " << sum << std::endl;
}
|
||||
|
||||
/**
 * Fill a 3D real array so that every element equals its index along the
 * fastest dimension.
 *
 * @param data     array with dimsize[0]*dimsize[1]*dimsize[2] elements
 * @param dimsize  extents; dimsize[0] is the fastest-varying dimension
 */
void initData(double *data, int dimsize[3]) {
  const int nk = dimsize[0];
  const int nj = dimsize[1];
  const int ni = dimsize[2];
  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        data[i*nj*nk + j*nk + k] = k;
      }
    }
  }
}
|
||||
|
||||
/** Print the usage text for testFFT3DRC to stdout. */
void printHelp() {
  std::cout << std::endl;

  std::cout << "testFFT3DRC executes 3D real complex and 3D complex real "
            << "function on the Intel MIC.\n";
  std::cout << "Operations performed by testRC are: "
            << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n";
  std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z "
            << "-loop $l\n";
  std::cout << "where $x $y $z are number of elements in each dimension and "
            << "$l is the number of times all the operations will be performed.\n";

  std::cout << std::endl;
}

/**
 * Parse the command line of testFFT3DRC.
 *
 * Recognised options:
 *   -grid x y z   stores x, y, z in N1, N2, N3
 *   -loop l       stores l in loop
 *   -h, -help     prints the help text
 * Unknown arguments are ignored. Outputs keep their incoming values when the
 * matching option is absent.
 *
 * @return true when the caller should exit: help was requested, or an option
 *         was missing its values (the help text is printed in both cases);
 *         false otherwise
 */
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {

  for (int i = 1; i < argc; i++) {
    const std::string arg(argv[i]);

    if (arg == "-grid") {
      // all three values must be present; never index past argv
      if (i + 3 >= argc) {
        printHelp();
        return true;
      }
      N1 = std::atoi(argv[i + 1]);
      N2 = std::atoi(argv[i + 2]);
      N3 = std::atoi(argv[i + 3]);
      i += 3;
    } else if (arg == "-loop") {
      if (i + 1 >= argc) {
        printHelp();
        return true;
      }
      loop = std::atoi(argv[i + 1]);
      i += 1;
    } else if (arg == "-h" || arg == "-help") {
      printHelp();
      return true;
    }
  }

  return false;
}
|
220
test/testFFT3DRC_MIC.cpp
Normal file
220
test/testFFT3DRC_MIC.cpp
Normal file
@ -0,0 +1,220 @@
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
void printData3DN4(double* data, int N, int dim);
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
void compareData(double* data1, double* data2, int N, int dim);
|
||||
|
||||
/* Compute (K*L)%M accurately: the product is formed in 64-bit arithmetic so
   it cannot overflow int. The result carries the sign of K*L. M must be non-zero. */
static double moda(int K, int L, int M)
{
  const long long product = static_cast<long long>(K) * L;
  return static_cast<double>(product % M);
}
|
||||
/* Initialize array x(N) to produce unit peaks at x(H) and x(N-H) */
|
||||
static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4)
|
||||
{
|
||||
double TWOPI = 6.2831853071795864769, phase, factor;
|
||||
int n1, n2, n3, S1, S2, S3, index;
|
||||
|
||||
/* Generalized strides for row-major addressing of x */
|
||||
S3 = 1;
|
||||
S2 = (N3/2+1)*2;
|
||||
S1 = N2*(N3/2+1)*2;
|
||||
|
||||
factor = ((N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0) ? 1.0 : 2.0;
|
||||
for (n1 = 0; n1 < N1; n1++)
|
||||
{
|
||||
for (n2 = 0; n2 < N2; n2++)
|
||||
{
|
||||
for (n3 = 0; n3 < N3; n3++)
|
||||
{
|
||||
phase = moda(n1,H1,N1) / N1;
|
||||
phase += moda(n2,H2,N2) / N2;
|
||||
phase += moda(n3,H3,N3) / N3;
|
||||
index = n1*S1 + n2*S2 + n3*S3;
|
||||
//cout << "index = " << index << endl;
|
||||
x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = atoi(argv[1]);
|
||||
int dim = 3;
|
||||
int dimsize[3] = {N, N, N};
|
||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||
int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2];
|
||||
|
||||
//double *rdata = new double[sizereal];
|
||||
//double *outdata = new double[sizereal];
|
||||
//complex<double> *cfft = new complex<double>[sizecomp];
|
||||
double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
|
||||
double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
|
||||
complex<double> *cfft = (complex<double> *)malloc(sizecomp*sizeof(complex<double>));
|
||||
|
||||
init_r(rdata, N,N,N);
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI("OpenMP", 6);
|
||||
base.setDevice("-mic", 4);
|
||||
base.initDevice();
|
||||
|
||||
/* setup forward fft (REAL->COMPLEX) */
|
||||
base.setupFFTRC(dim, dimsize);
|
||||
|
||||
int ierr;
|
||||
void *real_ptr, *comp_ptr;
|
||||
|
||||
/* allocate memory on device */;
|
||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
||||
|
||||
/* write data to device */
|
||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||
|
||||
//printData3DN4(rdata,N,3);
|
||||
|
||||
/* execute rcfft */
|
||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
|
||||
/* read FFT data from device */
|
||||
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
||||
base.writeData<double>(comp_ptr, cfft, sizereal);
|
||||
|
||||
|
||||
/* setup backward fft (COMPLEX->REAL) */
|
||||
base.setupFFTCR(dim, dimsize,1./(N*N*N));
|
||||
/* execute crfft */
|
||||
base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
|
||||
/* normalize */
|
||||
//base.callNormalizeC2RFFT(real_ptr, dim, dimsize);
|
||||
|
||||
/* read FFT data from device */
|
||||
//base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
||||
|
||||
/* read IFFT data from device */
|
||||
base.readData<double>(real_ptr, outdata, sizereal);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
||||
base.freeMemory<double>(real_ptr, sizereal);
|
||||
|
||||
/* compare data */
|
||||
compareData(rdata, outdata, N, dim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Print the real parts of a complex array to stdout, one line per row of
 * the fastest index and a blank line after each outermost slice.
 *
 * @param data      array read at data[i*ni*ni + j*nj + k], where ni is N for
 *                  dim > 2 (else 1) and nj is N for dim > 1 (else 1)
 * @param N         number of elements along each used dimension
 * @param dim       number of dimensions in use
 * @param normalize when true each printed value is divided by N
 */
void printData(std::complex<double>* &data, int N, int dim, bool normalize) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;

  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        const double re = data[i*ni*ni + j*nj + k].real();
        if (!normalize)
          std::cout << re << "\t";
        else
          std::cout << re / N << "\t";
      }
      std::cout << '\n';  // no per-row flush needed
    }
    std::cout << '\n';
  }
}
|
||||
|
||||
/**
 * Print an N*N*N complex array as "re; im" pairs, one output line per j
 * (all i and k on that line). Components with magnitude below 10e-5 are
 * printed as 0.
 *
 * @param data  array read at data[i*N*N + j*N + k]
 * @param N     number of elements along each dimension
 * @param dim   unused
 */
void printData3DN4(std::complex<double>* &data, int N, int dim) {

  (void)dim;  // kept for interface compatibility

  for (int j = 0; j < N; j++) {
    for (int i = 0; i < N; i++) {
      for (int k = 0; k < N; k++) {
        const std::complex<double> &v = data[i*N*N + j*N + k];
        double d = v.real();
        double a = v.imag();

        // suppress numerical noise around zero
        if (d < 10e-5 && d > -10e-5)
          d = 0;
        if (a < 10e-5 && a > -10e-5)
          a = 0;

        std::cout << d << "; " << a << "\t";
      }
    }
    std::cout << '\n';
  }
  std::cout << '\n';

}
|
||||
/**
 * Print an N*N*N real array, one output line per j (all i and k on that
 * line). Values with magnitude below 10e-5 are printed as 0.
 *
 * @param data  array read at data[i*N*N + j*N + k]
 * @param N     number of elements along each dimension
 * @param dim   unused
 */
void printData3DN4(double* data, int N, int dim) {

  (void)dim;  // kept for interface compatibility

  for (int j = 0; j < N; j++) {
    for (int i = 0; i < N; i++) {
      for (int k = 0; k < N; k++) {
        double d = data[i*N*N + j*N + k];

        // suppress numerical noise around zero
        if (d < 10e-5 && d > -10e-5)
          d = 0;

        std::cout << d << "\t";
      }
    }
    std::cout << '\n';
  }
  std::cout << '\n';

}
|
||||
/**
 * Sum the absolute differences of the real and imaginary parts of two
 * complex arrays and print the total to stdout.
 *
 * @param data1,data2  arrays read at [i*ni*ni + j*nj + k], where ni is N for
 *                     dim > 2 (else 1) and nj is N for dim > 1 (else 1)
 * @param N            number of elements along each used dimension
 * @param dim          number of dimensions in use
 */
void compareData(std::complex<double>* &data1, std::complex<double>* &data2, int N, int dim) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;
  double sum = 0;
  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        const int id = i*ni*ni + j*nj + k;
        sum += std::fabs(data1[id].real() - data2[id].real());
        sum += std::fabs(data1[id].imag() - data2[id].imag());
      }
    }
  }
  std::cout << "Size " << N << " CC <--> CC diff: " << sum << std::endl;
}
|
||||
|
||||
/**
 * Sum the absolute element-wise differences of two real arrays and print
 * the total to stdout.
 *
 * @param data1,data2  arrays read at [i*ni*ni + j*nj + k], where ni is N for
 *                     dim > 2 (else 1) and nj is N for dim > 1 (else 1)
 * @param N            number of elements along each used dimension
 * @param dim          number of dimensions in use
 */
void compareData(double* data1, double* data2, int N, int dim) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;
  double sum = 0;
  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        const int id = i*ni*ni + j*nj + k;
        //sum += fabs(data1[id] - data2[id]/(N*N*N));
        sum += std::fabs(data1[id] - data2[id]);
      }
    }
  }
  std::cout << "Size " << N << " RC <--> CR diff: " << sum << std::endl;
}
|
159
test/testFFT3DSO.cpp
Normal file
159
test/testFFT3DSO.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
/* usage - ./testFFT3D */
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 16;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 4) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, argv[3]);
|
||||
} else {
|
||||
N = 16;
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
cout << "Begin DKS Base tests, N = " << N << endl;
|
||||
|
||||
int dim = 3;
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cifft = new complex<double>[N*N*N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cifft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
base.setupFFT(3, dimsize);
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* allocate memory on device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
|
||||
/* compare results */
|
||||
compareData(cdata, cifft, N, dim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Print a complex array to stdout, one line per row of the fastest index
 * and a blank line after each outermost slice.
 *
 * @param data      array read at data[i*ni*ni + j*nj + k], where ni is N for
 *                  dim > 2 (else 1) and nj is N for dim > 1 (else 1)
 * @param N         number of elements along each used dimension
 * @param dim       number of dimensions in use
 * @param normalize false: print "re im" per element; true: print only re / N
 */
void printData(std::complex<double>* &data, int N, int dim, bool normalize) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;

  for (int i = 0; i < ni; i++) {
    for (int j = 0; j < nj; j++) {
      for (int k = 0; k < nk; k++) {
        const std::complex<double> &v = data[i*ni*ni + j*nj + k];
        if (!normalize)
          std::cout << v.real() << " " << v.imag() << "\t";
        else
          std::cout << v.real() / N << "\t";
      }
      std::cout << '\n';  // no per-row flush needed
    }
    std::cout << '\n';
  }
}
|
||||
|
||||
/*
 * Print an N*N*N complex array as "real; imag" pairs.
 *
 * Each output line holds, for one row index, that row taken from every
 * plane in turn. Components whose value lies strictly between -10e-5 and
 * 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {

  for (int row = 0; row < N; ++row) {
    for (int plane = 0; plane < N; ++plane) {
      const complex<double> *line = data + plane*N*N + row*N;
      for (int col = 0; col < N; ++col) {
        double re = line[col].real();
        double im = line[col].imag();

        /* suppress numerical noise around zero */
        re = (re < 10e-5 && re > -10e-5) ? 0 : re;
        im = (im < 10e-5 && im > -10e-5) ? 0 : im;

        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;

}
|
||||
|
||||
/*
 * Print the accumulated absolute difference between two complex arrays.
 *
 * The real and imaginary differences of every compared element are added
 * up and reported as "Size N CC <--> CC diff: <sum>".
 *
 * data1, data2 - arrays to compare
 * N            - extent of every dimension
 * dim          - dim > 2 compares N*N*N elements, dim == 2 compares N*N,
 *                anything lower compares N
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;

  double diff = 0;
  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      const int offset = p*planes*planes + r*rows;
      for (int c = 0; c < N; ++c) {
        const complex<double> &a = data1[offset + c];
        const complex<double> &b = data2[offset + c];
        diff += fabs(a.real() - b.real());
        diff += fabs(a.imag() - b.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
}
|
||||
|
130
test/testFFT3DTiming.cpp
Normal file
130
test/testFFT3DTiming.cpp
Normal file
@ -0,0 +1,130 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 4;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc > 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
N = atoi(argv[3]);
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
|
||||
cout << "Use api: " << api_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests, N = " << N << endl;
|
||||
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cifft = new complex<double>[N*N*N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
cdata[i*N*N + j*N + k] = complex<double>((double)i / N, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cifft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
timestamp_t t0, t1;
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* run stest funct to init device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
/* end test */
|
||||
|
||||
int steps = 10;
|
||||
base.oclClearEvents();
|
||||
t0 = get_timestamp();
|
||||
for (int i = 0; i < steps; i++) {
|
||||
|
||||
/* allocate memory on device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(mem_ptr, N);
|
||||
|
||||
//compareData(cdata, cifft, N, 3);
|
||||
}
|
||||
t1 = get_timestamp();
|
||||
|
||||
cout << "=========================" << endl;
|
||||
//base.oclEventInfo();
|
||||
cout << "Average total: " << get_secs(t0, t1) / steps << endl;
|
||||
cout << "=========================" << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Print the accumulated absolute difference between two complex arrays.
 *
 * The real and imaginary differences of every compared element are added
 * up and reported as "Size N CC <--> CC diff: <sum>".
 *
 * data1, data2 - arrays to compare
 * N            - extent of every dimension
 * dim          - dim > 2 compares N*N*N elements, dim == 2 compares N*N,
 *                anything lower compares N
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;

  double diff = 0;
  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      const int offset = p*planes*planes + r*rows;
      for (int c = 0; c < N; ++c) {
        const complex<double> &a = data1[offset + c];
        const complex<double> &b = data2[offset + c];
        diff += fabs(a.real() - b.real());
        diff += fabs(a.imag() - b.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
}
|
||||
|
117
test/testFFTAsync.cpp
Normal file
117
test/testFFTAsync.cpp
Normal file
@ -0,0 +1,117 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include <cufft.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * Fill a dimsize[2] x dimsize[1] x dimsize[0] array (dimsize[0] fastest)
 * so that every element holds its index along the fastest dimension.
 */
void initData(double *data, int dimsize[3]) {
  const int nx = dimsize[0];
  const int ny = dimsize[1];
  const int nz = dimsize[2];

  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      double *line = data + z*ny*nx + y*nx;
      for (int x = 0; x < nx; ++x)
        line[x] = x;
    }
  }
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 8;
|
||||
if (argc == 2)
|
||||
N = atoi(argv[1]);
|
||||
|
||||
int N1 = N;
|
||||
int N2 = N;
|
||||
int N3 = N;
|
||||
int dim = 3;
|
||||
|
||||
int dimsize[3] = {N3, N2, N1};
|
||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||
int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1);
|
||||
|
||||
double *data1 = new double[sizereal];
|
||||
double *data2 = new double[sizereal];
|
||||
|
||||
initData(data1, dimsize);
|
||||
initData(data2, dimsize);
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
base.setupFFT(3, dimsize);
|
||||
|
||||
/* pagelock data */
|
||||
base.allocateHostMemory(data1, sizereal);
|
||||
base.allocateHostMemory(data2, sizereal);
|
||||
|
||||
/* create streams */
|
||||
int fft1, fft2;
|
||||
base.createStream(fft1);
|
||||
base.createStream(fft2);
|
||||
|
||||
int ierr;
|
||||
void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2;
|
||||
|
||||
cout << "allocating memory ..." << endl;
|
||||
/* allocate memory on device */;
|
||||
real_ptr1 = base.allocateMemory<double>(sizereal, ierr);
|
||||
real_ptr2 = base.allocateMemory<double>(sizereal, ierr);
|
||||
comp_ptr1 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
|
||||
comp_ptr2 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
|
||||
|
||||
cufftHandle defaultPlan;
|
||||
cudaStream_t cfft1, cfft2;
|
||||
cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z);
|
||||
cudaStreamCreate(&cfft1);
|
||||
cudaStreamCreate(&cfft2);
|
||||
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
|
||||
cufftHandle plan = defaultPlan;
|
||||
|
||||
cout << "Iteration: " << i << endl;
|
||||
/* write data to device */
|
||||
base.writeDataAsync<double>(real_ptr1, data1, sizereal, fft1);
|
||||
//cudaMemcpyAsync( (double*)real_ptr1,data1,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft1);
|
||||
|
||||
/* execute rcfft */
|
||||
base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1);
|
||||
//cufftSetStream(plan, cfft1);
|
||||
//cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr1, (cufftDoubleComplex*)comp_ptr2);
|
||||
|
||||
/* write data to device */
|
||||
base.writeDataAsync<double>(real_ptr2, data2, sizereal, fft2);
|
||||
//cudaMemcpyAsync( (double*)real_ptr2,data2,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft2);
|
||||
|
||||
/* execute rcfft */
|
||||
base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2);
|
||||
//cufftSetStream(plan, cfft2);
|
||||
//cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr2, (cufftDoubleComplex*)comp_ptr2);
|
||||
|
||||
}
|
||||
|
||||
base.freeMemory<double>(real_ptr1, sizereal);
|
||||
base.freeMemory<double>(real_ptr2, sizereal);
|
||||
base.freeMemory< complex<double> >(comp_ptr1, sizereal);
|
||||
base.freeMemory< complex<double> >(comp_ptr2, sizereal);
|
||||
|
||||
/* free pagelock data */
|
||||
base.freeHostMemory(data1, sizereal);
|
||||
base.freeHostMemory(data2, sizereal);
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
301
test/testFFTSolver.cpp
Normal file
301
test/testFFTSolver.cpp
Normal file
@ -0,0 +1,301 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/*
 * Print NI planes of N x N values, tab separated, one row per line and a
 * blank line after every plane. A non-empty message is printed first,
 * without a trailing newline.
 */
void printData3D(double* data, int N, int NI, const char *message = "") {
  if (message[0] != '\0')
    cout << message;

  /* the elements are visited in storage order, so a cursor is enough */
  const double *cursor = data;
  for (int plane = 0; plane < NI; ++plane) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col)
        cout << *cursor++ << "\t";
      cout << endl;
    }
    cout << endl;
  }

}
|
||||
|
||||
/*
 * Write col+1 into the leading corner of an N x N x N array: the first
 * N/4+1 planes, and in each of them the first N/2+1 rows and columns.
 * All other elements are left untouched.
 */
void initData(double *data, int N) {

  const int depth = N/4 + 1;
  const int half = N/2 + 1;

  for (int plane = 0; plane < depth; ++plane) {
    for (int row = 0; row < half; ++row) {
      double *line = data + plane*N*N + row*N;
      for (int col = 0; col < half; ++col)
        line[col] = col + 1;
    }
  }
}
|
||||
|
||||
/* Fill data[0..N-1] with the element's own index. */
void initData2(double *data, int N) {
  int pos = 0;
  while (pos < N) {
    data[pos] = pos;
    ++pos;
  }
}
|
||||
|
||||
/* Set the first N elements of d to the complex value (2, 0). */
void initComplex( complex<double> *d, int N) {

  const complex<double> two(2, 0);
  for (int left = N; left > 0; --left)
    *d++ = two;

}
|
||||
|
||||
/* Print the first N elements of d on one line, each followed by a tab. */
void printComplex(complex<double> *d, int N) {

  for (int left = N; left > 0; --left)
    cout << *d++ << "\t";
  cout << endl;

}
|
||||
|
||||
/*
 * Initialise an n3 x n2 x n1 array (n1 fastest). Elements whose index is
 * below n/2+1 in all three dimensions receive consecutive values 1, 2, 3,
 * ... in storage order; every other element is set to 0.
 */
void initMirror(double *data, int n1, int n2, int n3) {
  const int lim1 = n1/2 + 1;
  const int lim2 = n2/2 + 1;
  const int lim3 = n3/2 + 1;

  int next = 1;
  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      const bool inside = (z < lim3) && (y < lim2);
      double *line = data + z * n2 * n1 + y * n1;
      for (int x = 0; x < n1; ++x)
        line[x] = (inside && x < lim1) ? next++ : 0;
    }
  }
}
|
||||
|
||||
/* Print a divider line made of c dashes. */
void printDiv(int c) {
  while (c-- > 0)
    cout << "-";
  cout << endl;

}
|
||||
|
||||
/*
 * Print a 75-dash divider followed by an n3 x n2 x n1 array (n1 fastest):
 * one tab separated row per line, a blank line after every plane and one
 * more blank line at the end.
 */
void printMirror(double *data, int n1, int n2, int n3) {

  printDiv(75);

  /* the elements are visited in storage order, so a cursor is enough */
  const double *cursor = data;
  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      for (int x = 0; x < n1; ++x)
        cout << *cursor++ << "\t";
      cout << endl;
    }
    cout << endl;
  }
  cout << endl;
}
|
||||
|
||||
/* Return the sum of data[0..datasize-1], added up front to back. */
double sumData(double *data, int datasize) {

  double total = 0;
  for (int left = datasize; left > 0; --left)
    total += *data++;

  return total;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
/* mpi init */
|
||||
int rank, nprocs;
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
if (nprocs != 8) {
|
||||
cout << "example was set to run with 8 processes" << endl;
|
||||
cout << "exit..." << endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* set domain size */
|
||||
int NG[3] = {64, 64, 32};
|
||||
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
||||
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
||||
int sizerho = NG[0] * NG[1] * NG[2];
|
||||
int sizegreen = ng[0] * ng[1] * ng[2];
|
||||
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
||||
int id[3];
|
||||
|
||||
id[0] = 0;
|
||||
id[1] = NL[1] * (rank % 4);
|
||||
id[2] = NL[2] * (rank / 4);
|
||||
|
||||
/* print some messages bout the example in the begginig */
|
||||
if (rank == 0) {
|
||||
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
||||
cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
|
||||
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
||||
cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
|
||||
int tmp[3];
|
||||
for (int p = 1; p < nprocs; p++) {
|
||||
MPI_Status mpistatus;
|
||||
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
|
||||
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
|
||||
}
|
||||
} else {
|
||||
MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
/* dks init and create 2 streams */
|
||||
int dkserr;
|
||||
int streamGreens, streamFFT;
|
||||
DKSBase base;// = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
base.createStream(streamFFT);
|
||||
if (rank == 0) {
|
||||
base.createStream(streamGreens);
|
||||
base.setupFFT(3, NG);
|
||||
}
|
||||
|
||||
/* allocate memory and init rho field */
|
||||
double *rho = new double[sizerho];
|
||||
double *rho_out = new double[sizerho];
|
||||
//double *green_out = new double[sizegreen];
|
||||
initMirror(rho, NL[0], NL[1], NL[2]);
|
||||
|
||||
/*
|
||||
allocate memory on device for
|
||||
- rho field
|
||||
- rho FFT
|
||||
- tmpgreen
|
||||
- greens integral
|
||||
- greens integral FFT
|
||||
*/
|
||||
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
||||
if (rank == 0) {
|
||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
||||
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||
} else {
|
||||
grntr_ptr = NULL;
|
||||
rho2_ptr = NULL;
|
||||
grn_ptr = NULL;
|
||||
rho2tr_ptr = NULL;
|
||||
tmpgreen_ptr = NULL;
|
||||
}
|
||||
|
||||
/* send and receive pointer to allocated memory on device */
|
||||
if (rank == 0) {
|
||||
for (int p = 1; p < nprocs; p++)
|
||||
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
|
||||
} else {
|
||||
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
|
||||
}
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
/* =====loop trough fftpoison solver iterations=====*/
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
|
||||
double old_sum = 0;
|
||||
double tmp_sum = 0;
|
||||
for (int l = 0; l < 10000; l++) {
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
/* on node 0, calculate tmpgreen on gpu */
|
||||
int hr_m[3] = {1, 1, 1};
|
||||
if (rank == 0)
|
||||
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
|
||||
hr_m[0], hr_m[1], hr_m[2], streamGreens);
|
||||
|
||||
/* calculate greens integral on gpu */
|
||||
if (rank == 0)
|
||||
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens);
|
||||
|
||||
/* mirror the field */
|
||||
if (rank == 0)
|
||||
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens);
|
||||
|
||||
|
||||
/* get FFT of mirrored greens integral */
|
||||
if (rank == 0)
|
||||
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens);
|
||||
|
||||
/* transfer rho field to device */
|
||||
base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
/* get FFT of rho field */
|
||||
if (rank == 0) {
|
||||
base.syncDevice();
|
||||
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
||||
}
|
||||
|
||||
/* multiply both FFTs */
|
||||
if (rank == 0)
|
||||
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
/* inverse fft and transfer data back */
|
||||
/*
|
||||
multiple device syncs and mpi barriers are used to make sure data
|
||||
transfer is started when results are ready and progam moves on
|
||||
only when data transfer is finished
|
||||
*/
|
||||
if (rank == 0) {
|
||||
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
||||
base.syncDevice();
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.syncDevice();
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
//cout << "result: " << sumData(rho_out, sizerho) << endl;
|
||||
if (l == 0) {
|
||||
old_sum = sumData(rho_out, sizerho);
|
||||
} else {
|
||||
tmp_sum = sumData(rho_out, sizerho);
|
||||
if (old_sum != tmp_sum) {
|
||||
cout << "diff in iteration: " << l << endl;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
/* ==========end fftpoison solver test run==========*/
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
|
||||
|
||||
|
||||
/* free memory on device */
|
||||
if (rank == 0) {
|
||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||
base.freeMemory<double>(grn_ptr, sizerho);
|
||||
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
||||
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.freeMemory<double>(rho2_ptr, sizerho);
|
||||
cout << "Final sum: " << old_sum << endl;
|
||||
} else {
|
||||
base.closeHandle(rho2_ptr);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
|
||||
|
||||
}
|
319
test/testFFTSolver_MIC.cpp
Normal file
319
test/testFFTSolver_MIC.cpp
Normal file
@ -0,0 +1,319 @@
|
||||
#include <iostream>
|
||||
//#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/*
 * Print NI planes of N x N values, tab separated, one row per line and a
 * blank line after every plane. A non-empty message is printed first,
 * without a trailing newline.
 */
void printData3D(double* data, int N, int NI, const char *message = "") {
  if (message[0] != '\0')
    cout << message;

  /* the elements are visited in storage order, so a cursor is enough */
  const double *cursor = data;
  for (int plane = 0; plane < NI; ++plane) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col)
        cout << *cursor++ << "\t";
      cout << endl;
    }
    cout << endl;
  }

}
|
||||
|
||||
/*
 * Write col+1 into the leading corner of an N x N x N array: the first
 * N/4+1 planes, and in each of them the first N/2+1 rows and columns.
 * All other elements are left untouched.
 */
void initData(double *data, int N) {

  const int depth = N/4 + 1;
  const int half = N/2 + 1;

  for (int plane = 0; plane < depth; ++plane) {
    for (int row = 0; row < half; ++row) {
      double *line = data + plane*N*N + row*N;
      for (int col = 0; col < half; ++col)
        line[col] = col + 1;
    }
  }
}
|
||||
|
||||
/* Fill data[0..N-1] with the element's own index. */
void initData2(double *data, int N) {
  int pos = 0;
  while (pos < N) {
    data[pos] = pos;
    ++pos;
  }
}
|
||||
|
||||
/* Set the first N elements of d to the complex value (2, 0). */
void initComplex( complex<double> *d, int N) {

  const complex<double> two(2, 0);
  for (int left = N; left > 0; --left)
    *d++ = two;

}
|
||||
|
||||
/* Print the first N elements of d on one line, each followed by a tab. */
void printComplex(complex<double> *d, int N) {

  for (int left = N; left > 0; --left)
    cout << *d++ << "\t";
  cout << endl;

}
|
||||
|
||||
/*
 * Initialise an n3 x n2 x n1 array (n1 fastest). Elements whose index is
 * below n/2+1 in all three dimensions receive consecutive values 1, 2, 3,
 * ... in storage order; every other element is set to 0.
 */
void initMirror(double *data, int n1, int n2, int n3) {
  const int lim1 = n1/2 + 1;
  const int lim2 = n2/2 + 1;
  const int lim3 = n3/2 + 1;

  int next = 1;
  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      const bool inside = (z < lim3) && (y < lim2);
      double *line = data + z * n2 * n1 + y * n1;
      for (int x = 0; x < n1; ++x)
        line[x] = (inside && x < lim1) ? next++ : 0;
    }
  }
}
|
||||
|
||||
/* Print a divider line made of c dashes. */
void printDiv(int c) {
  while (c-- > 0)
    cout << "-";
  cout << endl;

}
|
||||
|
||||
/*
 * Print a 75-dash divider followed by an n3 x n2 x n1 array (n1 fastest):
 * one tab separated row per line, a blank line after every plane and one
 * more blank line at the end.
 */
void printMirror(double *data, int n1, int n2, int n3) {

  printDiv(75);

  /* the elements are visited in storage order, so a cursor is enough */
  const double *cursor = data;
  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      for (int x = 0; x < n1; ++x)
        cout << *cursor++ << "\t";
      cout << endl;
    }
    cout << endl;
  }
  cout << endl;
}
|
||||
|
||||
/* Return the sum of data[0..datasize-1], added up front to back. */
double sumData(double *data, int datasize) {

  double total = 0;
  for (int left = datasize; left > 0; --left)
    total += *data++;

  return total;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
/* mpi init */
|
||||
//int rank, nprocs;
|
||||
//MPI_Init(&argc, &argv);
|
||||
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
/*
|
||||
if (nprocs != 8) {
|
||||
cout << "example was set to run with 8 processes" << endl;
|
||||
cout << "exit..." << endl;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/* set domain size */
|
||||
int NG[3] = {64, 64, 32};
|
||||
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
||||
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
||||
int sizerho = NG[0] * NG[1] * NG[2];
|
||||
int sizegreen = ng[0] * ng[1] * ng[2];
|
||||
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
||||
int id[3];
|
||||
|
||||
//id[0] = 0;
|
||||
//id[1] = NL[1] * (rank % 4);
|
||||
//id[2] = NL[2] * (rank / 4);
|
||||
|
||||
/* print some messages bout the example in the begginig */
|
||||
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
||||
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
|
||||
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
||||
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
|
||||
int tmp[3];
|
||||
/* for (int p = 1; p < nprocs; p++) {
|
||||
MPI_Status mpistatus;
|
||||
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
|
||||
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
|
||||
}*/
|
||||
// } else {
|
||||
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
|
||||
// }
|
||||
|
||||
/* dks init and create 2 streams */
|
||||
int dkserr;
|
||||
//int streamGreens, streamFFT;
|
||||
#ifdef DKS_MIC
|
||||
DKSBase base;
|
||||
base.setAPI("OpenMP", 6);
|
||||
base.setDevice("-mic", 4);
|
||||
base.initDevice();
|
||||
#endif
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
#endif
|
||||
|
||||
//base.createStream(streamFFT);
|
||||
//if (rank == 0) {
|
||||
// base.createStream(streamGreens);
|
||||
base.setupFFT(3, NG);
|
||||
//}
|
||||
|
||||
/* allocate memory and init rho field */
|
||||
double *rho = new double[sizerho];
|
||||
double *rho_out = new double[sizerho];
|
||||
//double *green_out = new double[sizegreen];
|
||||
initMirror(rho, NL[0], NL[1], NL[2]);
|
||||
|
||||
/*
|
||||
allocate memory on device for
|
||||
- rho field
|
||||
- rho FFT
|
||||
- tmpgreen
|
||||
- greens integral
|
||||
- greens integral FFT
|
||||
*/
|
||||
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
||||
// if (rank == 0) {
|
||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
||||
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||
/* } else {
|
||||
grntr_ptr = NULL;
|
||||
rho2_ptr = NULL;
|
||||
grn_ptr = NULL;
|
||||
rho2tr_ptr = NULL;
|
||||
tmpgreen_ptr = NULL;
|
||||
}*/
|
||||
|
||||
|
||||
/* send and receive pointer to allocated memory on device */
|
||||
/*
|
||||
if (rank == 0) {
|
||||
for (int p = 1; p < nprocs; p++)
|
||||
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
|
||||
} else {
|
||||
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
|
||||
}
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
*/
|
||||
|
||||
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
/* =====loop trough fftpoison solver iterations=====*/
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
|
||||
double old_sum = 0;
|
||||
double tmp_sum = 0;
|
||||
for (int l = 0; l < 100; l++) {
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
/* on node 0, calculate tmpgreen on gpu */
|
||||
int hr_m[3] = {1, 1, 1};
|
||||
//if (rank == 0)
|
||||
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
|
||||
hr_m[0], hr_m[1], hr_m[2]);
|
||||
|
||||
/* calculate greens integral on gpu */
|
||||
//if (rank == 0)
|
||||
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
||||
|
||||
/* mirror the field */
|
||||
//if (rank == 0)
|
||||
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
||||
|
||||
|
||||
/* get FFT of mirrored greens integral */
|
||||
//if (rank == 0)
|
||||
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
||||
|
||||
/* transfer rho field to device */
|
||||
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
|
||||
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
/* get FFT of rho field */
|
||||
//if (rank == 0) {
|
||||
//base.syncDevice();
|
||||
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
||||
//}
|
||||
|
||||
/* multiply both FFTs */
|
||||
//if (rank == 0)
|
||||
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
/* inverse fft and transfer data back */
|
||||
/*
|
||||
multiple device syncs and mpi barriers are used to make sure data
|
||||
transfer is started when results are ready and progam moves on
|
||||
only when data transfer is finished
|
||||
*/
|
||||
//if (rank == 0) {
|
||||
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
||||
//base.syncDevice();
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
//base.syncDevice();
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
//cout << "result: " << sumData(rho_out, sizerho) << endl;
|
||||
if (l == 0) {
|
||||
old_sum = sumData(rho_out, sizerho);
|
||||
} else {
|
||||
tmp_sum = sumData(rho_out, sizerho);
|
||||
if (old_sum != tmp_sum) {
|
||||
cout << "diff in iteration: " << l << endl;
|
||||
}
|
||||
}
|
||||
/*} else {
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
}
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
/* ==========end fftpoison solver test run==========*/
|
||||
/* =================================================*/
|
||||
/* =================================================*/
|
||||
|
||||
|
||||
|
||||
/* free memory on device */
|
||||
//if (rank == 0) {
|
||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||
base.freeMemory<double>(grn_ptr, sizerho);
|
||||
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
||||
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
||||
//MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.freeMemory<double>(rho2_ptr, sizerho);
|
||||
cout << "Final sum: " << old_sum << endl;
|
||||
/*} else {
|
||||
base.closeHandle(rho2_ptr);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}*/
|
||||
|
||||
//MPI_Finalize();
|
||||
|
||||
|
||||
}
|
172
test/testGather.cpp
Normal file
172
test/testGather.cpp
Normal file
@ -0,0 +1,172 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/*
 * Print an N x N x N integer array, tab separated, one row per line and a
 * blank line after every plane. A non-empty message is printed first,
 * without a trailing newline.
 */
void printData3D(int* data, int N, const char *message = "") {
  if (message[0] != '\0')
    cout << message;

  /* the elements are visited in storage order, so a cursor is enough */
  const int *cursor = data;
  for (int plane = 0; plane < N; ++plane) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col)
        cout << *cursor++ << "\t";
      cout << endl;
    }
    cout << endl;
  }

}
|
||||
|
||||
|
||||
/*
 * Print an nz x ny x nx integer array (nx fastest), tab separated, one row
 * of nx values per line and a blank line after every plane. A non-empty
 * message is printed first, without a trailing newline.
 */
void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {

  if (message[0] != '\0')
    cout << message;

  /* the elements are visited in storage order, so a cursor is enough */
  const int *cursor = data;
  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      for (int x = 0; x < nx; ++x)
        cout << *cursor++ << "\t";
      cout << endl;
    }
    cout << endl;
  }
}
|
||||
|
||||
|
||||
/*
 * Print nprocs rows of N integers each, tab separated. A non-empty message
 * is printed first, without a trailing newline.
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
  if (message[0] != '\0')
    cout << message;

  for (int proc = 0; proc < nprocs; ++proc) {
    const int *line = data + proc*N;
    for (int col = 0; col < N; ++col)
      cout << line[col] << "\t";
    cout << endl;
  }
}
|
||||
|
||||
/* Set the first N elements of data to rank+1. */
void initData(int *data, int N, int rank) {
  const int value = rank + 1;
  for (int left = N; left > 0; --left)
    *data++ = value;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
cout << "Rank " << (rank+1) << " from " << nprocs << endl;
|
||||
|
||||
int N_global[3] = {64, 64, 32};
|
||||
int N_local[3] = {64, 32, 16};
|
||||
int n = N_local[0] * N_local[1] * N_local[2];
|
||||
|
||||
int idx[4] = {0, 0, 0, 0};
|
||||
int idy[4] = {0, 32, 0, 32};
|
||||
int idz[4] = {0, 0, 16, 16};
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
|
||||
int *hdata_in;
|
||||
if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
|
||||
hdata_in = new int[n];
|
||||
cout << "pinned allocation failed!" << endl;
|
||||
}
|
||||
initData(hdata_in, n, rank);
|
||||
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
if (i == 1)
|
||||
nvtxMarkA("start gather");
|
||||
|
||||
if (rank == 0) {
|
||||
|
||||
void *mem_ptr, *tmpgreen_ptr;
|
||||
|
||||
mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
|
||||
|
||||
//call another kernel
|
||||
int sizegreen = 33 * 33 * 17;
|
||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
|
||||
nvtxMarkA("call green");
|
||||
base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007);
|
||||
|
||||
nvtxMarkA("call gather");
|
||||
base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
|
||||
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
|
||||
|
||||
//read and print data once for debug only
|
||||
/*
|
||||
if (i == 0 && nprocs*n < 257) {
|
||||
int *hdata_out_all = new int[nprocs*n];
|
||||
base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
|
||||
printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]);
|
||||
}
|
||||
|
||||
else {
|
||||
int *hout_data = new int[nprocs*n];
|
||||
base.readData<int>(mem_ptr, hout_data, nprocs*n);
|
||||
int sum = 0;
|
||||
for (int s = 0; s < nprocs*n; s++)
|
||||
sum += hout_data[s];
|
||||
|
||||
cout << "Sum: " << sum << endl;
|
||||
}
|
||||
*/
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
nvtxMarkA("call scatter");
|
||||
base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
|
||||
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
|
||||
|
||||
base.freeMemory<int>(mem_ptr, n*nprocs);
|
||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||
|
||||
} else {
|
||||
|
||||
nvtxMarkA("call gather");
|
||||
base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local,
|
||||
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
nvtxMarkA("call scatter");
|
||||
base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local,
|
||||
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
if (i == 1)
|
||||
nvtxMarkA("end gather");
|
||||
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.freeHostMemory(hdata_in, n);
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
144
test/testGatherAsync.cpp
Normal file
144
test/testGatherAsync.cpp
Normal file
@ -0,0 +1,144 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/**
 * Dump an N x N x N cube of integers to standard output.
 *
 * Values are read in storage order (element (i, j, k) lives at
 * data[i*N*N + j*N + k]). Each (i, j) row is printed on one line with a tab
 * after every value, and each i-slab is closed by an empty line.
 *
 * @param data    array holding at least N*N*N values
 * @param N       edge length of the cube
 * @param message text written verbatim before the values; skipped when empty
 */
void printData3D(int* data, int N, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  int offset = 0;  // running linear index, advances in storage order
  for (int slab = 0; slab < N; ++slab) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col)
        std::cout << data[offset++] << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}
|
||||
|
||||
|
||||
/**
 * Print nprocs rows of N integers each, one row per line.
 *
 * Row r starts at data[r*N]; every value is followed by a tab.
 *
 * @param data    array holding at least nprocs*N values
 * @param N       number of values per row
 * @param nprocs  number of rows
 * @param message text written verbatim before the values; skipped when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  for (int row = 0; row < nprocs; ++row) {
    const int rowStart = row * N;
    int col = 0;
    while (col < N) {
      std::cout << data[rowStart + col] << "\t";
      ++col;
    }
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Fill the first N entries of data with the value rank + 1.
 *
 * @param data  destination array with room for at least N values
 * @param N     number of entries to write (nothing is written when N <= 0)
 * @param rank  rank id; the stored value is rank + 1
 */
void initData(int *data, int N, int rank) {
  const int value = rank + 1;
  for (int* slot = data; N > 0; --N, ++slot)
    *slot = value;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
cout << "Rank " << (rank+1) << " from " << nprocs << endl;
|
||||
|
||||
//mpi copy
|
||||
int n = 32*16*16;
|
||||
int N_global[3] = {32, 32, 32};
|
||||
int N_local[3] = {32, 16, 16};
|
||||
int idx[4] = {0, 0, 0, 0};
|
||||
int idy[4] = {0, 0, 16, 16};
|
||||
int idz[4] = {0, 16, 0, 16};
|
||||
|
||||
//greens kernel
|
||||
int n1 = 33;
|
||||
int n2 = 33;
|
||||
int n3 = 17;
|
||||
int sizegreen = n1*n2*n3;
|
||||
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
int *hdata_in;
|
||||
if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
|
||||
hdata_in = new int[n];
|
||||
cout << "pinned allocation failed!" << endl;
|
||||
}
|
||||
initData(hdata_in, n, rank);
|
||||
|
||||
int stream2;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
|
||||
if (rank == 0) {
|
||||
if (i == 0) {
|
||||
cudaProfilerStart();
|
||||
base.createStream(stream2);
|
||||
}
|
||||
|
||||
nvtxMarkA("start gather");
|
||||
|
||||
void *mem_ptr, *green_ptr;
|
||||
|
||||
mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
|
||||
green_ptr = base.allocateMemory<int>(sizegreen, ierr);
|
||||
|
||||
nvtxMarkA("call gather");
|
||||
MPI_Request request;
|
||||
MPI_Status status;
|
||||
|
||||
base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
|
||||
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
|
||||
request);
|
||||
|
||||
|
||||
nvtxMarkA("call kernel");
|
||||
base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1,
|
||||
4.160715e-03, 4.474911e-03, 1.247311e-02, stream2);
|
||||
|
||||
MPI_Wait(&request, &status);
|
||||
|
||||
|
||||
base.freeMemory<int>(mem_ptr, n*nprocs);
|
||||
base.freeMemory<int>(green_ptr, sizegreen);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
nvtxMarkA("end gather");
|
||||
|
||||
if (i == 1) cudaProfilerStop();
|
||||
} else {
|
||||
|
||||
MPI_Request request;
|
||||
base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local,
|
||||
idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
|
||||
request);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
base.freeHostMemory(hdata_in, n);
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
205
test/testGatherAsync2.cpp
Normal file
205
test/testGatherAsync2.cpp
Normal file
@ -0,0 +1,205 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/**
 * Dump an N x N x N cube of integers to standard output.
 *
 * Values are read in storage order (element (i, j, k) lives at
 * data[i*N*N + j*N + k]). Each (i, j) row is printed on one line with a tab
 * after every value, and each i-slab is closed by an empty line.
 *
 * @param data    array holding at least N*N*N values
 * @param N       edge length of the cube
 * @param message text written verbatim before the values; skipped when empty
 */
void printData3D(int* data, int N, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  int offset = 0;  // running linear index, advances in storage order
  for (int slab = 0; slab < N; ++slab) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col)
        std::cout << data[offset++] << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Dump an nx x ny x nz block of integers to standard output.
 *
 * x is the fastest-varying index: element (x, y, z) is read from
 * data[z*ny*nx + y*nx + x]. Each x-row is printed on one line with a tab
 * after every value, and each z-plane is closed by an empty line.
 *
 * @param data    array holding at least nx*ny*nz values
 * @param nx      extent of the fastest-varying dimension
 * @param ny      extent of the middle dimension
 * @param nz      extent of the slowest-varying dimension
 * @param message text written verbatim before the values; skipped when empty
 */
void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  int offset = 0;  // running linear index, advances in storage order
  for (int plane = 0; plane < nz; ++plane) {
    for (int row = 0; row < ny; ++row) {
      for (int col = 0; col < nx; ++col)
        std::cout << data[offset++] << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}
|
||||
|
||||
|
||||
/**
 * Print nprocs*N integers on a single tab-separated line, followed by an
 * empty line.
 *
 * @param data    array holding at least nprocs*N values
 * @param N       number of values per process
 * @param nprocs  number of processes
 * @param message text written verbatim before the values; skipped when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  const int count = nprocs * N;
  int pos = 0;
  while (pos < count) {
    std::cout << data[pos] << "\t";
    ++pos;
  }
  std::cout << std::endl << std::endl;
}
|
||||
|
||||
/**
 * Fill the first N entries of data with the value rank + 1.
 *
 * @param data  destination array with room for at least N values
 * @param N     number of entries to write (nothing is written when N <= 0)
 * @param rank  rank id; the stored value is rank + 1
 */
void initData(int *data, int N, int rank) {
  const int value = rank + 1;
  for (int* slot = data; N > 0; --N, ++slot)
    *slot = value;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
//cout << "Rank " << (rank+1) << " from " << nprocs << endl;
|
||||
|
||||
int Ng[3] = {128, 128, 64};
|
||||
int Nl[3] = {128, 64, 32};
|
||||
int nglobal = Ng[0] * Ng[1] * Ng[2];
|
||||
int nlocal = Nl[0] * Nl[1] * Nl[2];
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
int *hdata_in;
|
||||
if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) {
|
||||
hdata_in = new int[nlocal];
|
||||
cout << "pinned allocation failed!" << endl;
|
||||
}
|
||||
initData(hdata_in, nlocal, rank);
|
||||
|
||||
int *hdata_out;
|
||||
if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) {
|
||||
hdata_out = new int[nlocal];
|
||||
cout << "pinned allocation failed!" << endl;
|
||||
}
|
||||
|
||||
//create streams for async execution
|
||||
int stream1, stream2;
|
||||
base.createStream(stream1);
|
||||
base.createStream(stream2);
|
||||
|
||||
if (rank == 0)
|
||||
base.setupFFT(3, Ng);
|
||||
|
||||
for (int i = 0; i < 1; i++) {
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
if (i == 1)
|
||||
nvtxMarkA("start gather");
|
||||
|
||||
if (rank == 0) {
|
||||
|
||||
int id[3] = {0, 0, 0};
|
||||
|
||||
void *mem_ptr, *tmpgreen_ptr, *comp_ptr;
|
||||
|
||||
//allocate memory on device
|
||||
int sizegreen = 65 * 65 * 33;
|
||||
int sizecomp = 65 * 128 * 64;
|
||||
mem_ptr = base.allocateMemory<double>(nglobal, ierr);
|
||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
|
||||
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
||||
|
||||
//send pointer to other processes
|
||||
nvtxMarkA("call gather");
|
||||
for (int j = 1; j < nprocs; j++)
|
||||
base.sendPointer(mem_ptr, j, MPI_COMM_WORLD);
|
||||
|
||||
//call another kernel while data transfer is processing
|
||||
nvtxMarkA("call green");
|
||||
base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2);
|
||||
|
||||
//write data to device
|
||||
base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
|
||||
|
||||
/* execute rcfft */
|
||||
//base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng);
|
||||
|
||||
base.syncDevice();
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
//read data from device
|
||||
base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.syncDevice();
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
|
||||
base.freeMemory<double>(mem_ptr, nglobal);
|
||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
||||
|
||||
} else {
|
||||
|
||||
|
||||
void *mem_ptr;
|
||||
int idy = 0;
|
||||
int idz = 0;//Nl[2]*rank;
|
||||
if (rank / 2 == 1) idy = Ng[1] / 2;
|
||||
if (rank % 2 == 1) idz = Ng[2] / 2;
|
||||
int id[3] = {0, idy, idz};
|
||||
|
||||
nvtxMarkA("call gather");
|
||||
mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr);
|
||||
base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
base.closeHandle(mem_ptr);
|
||||
|
||||
}
|
||||
|
||||
int sum1 = 0;
|
||||
for (int c = 0; c < nlocal; c++)
|
||||
sum1 += hdata_in[c];
|
||||
|
||||
int sum2 = 0;
|
||||
for (int c = 0; c < nlocal; c++)
|
||||
sum2 += hdata_out[c];
|
||||
|
||||
cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl;
|
||||
|
||||
|
||||
if (i == 1)
|
||||
nvtxMarkA("end gather");
|
||||
|
||||
}
|
||||
|
||||
//printData(hdata_in, nlocal, 1);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
base.freeHostMemory(hdata_in, nlocal);
|
||||
//delete[] hdata_in;
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
239
test/testGreens.cpp
Normal file
239
test/testGreens.cpp
Normal file
@ -0,0 +1,239 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
#include <complex>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/**
 * Dump NI slabs of N x N doubles to standard output.
 *
 * Element (i, j, k) is read from data[i*N*N + j*N + k]. Each (i, j) row is
 * printed on one line with a tab after every value, and each slab is closed
 * by an empty line.
 *
 * @param data    array holding at least NI*N*N values
 * @param N       extent of the two fastest-varying dimensions
 * @param NI      number of slabs to print
 * @param message text written verbatim before the values; skipped when empty
 */
void printData3D(double* data, int N, int NI, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  int offset = 0;  // running linear index, advances in storage order
  for (int slab = 0; slab < NI; ++slab) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col)
        std::cout << data[offset++] << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Write k + 1 into the leading corner of an array indexed as
 * data[i*N*N + j*N + k].
 *
 * Only indices with i in [0, N/4], j in [0, N/2] and k in [0, N/2] are
 * written; all other entries are left untouched.
 *
 * @param data  destination array, indexed with stride N in j and N*N in i
 * @param N     stride / edge length used for indexing
 */
void initData(double *data, int N) {
  const int slabCount = N/4 + 1;
  const int span = N/2 + 1;

  for (int i = 0; i < slabCount; ++i) {
    for (int j = 0; j < span; ++j) {
      const int rowBase = i*N*N + j*N;
      for (int k = 0; k < span; ++k)
        data[rowBase + k] = k+1;
    }
  }
}
|
||||
|
||||
/**
 * Fill data[0..N-1] with the ramp 0, 1, 2, ...
 *
 * @param data  destination array with room for at least N values
 * @param N     number of entries to write
 */
void initData2(double *data, int N) {
  int pos = 0;
  while (pos < N) {
    data[pos] = pos;
    ++pos;
  }
}
|
||||
|
||||
/**
 * Set the first N entries of d to the complex value (2, 0).
 *
 * @param d  destination array with room for at least N values
 * @param N  number of entries to write
 */
void initComplex( std::complex<double> *d, int N) {
  const std::complex<double> fill(2, 0);
  for (std::complex<double>* slot = d; N > 0; --N, ++slot)
    *slot = fill;
}
|
||||
|
||||
/**
 * Print the first N complex values of d on one tab-separated line.
 *
 * @param d  array holding at least N values
 * @param N  number of values to print
 */
void printComplex(std::complex<double> *d, int N) {
  int pos = 0;
  while (pos < N) {
    std::cout << d[pos] << "\t";
    ++pos;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Initialise an n1 x n2 x n3 array (n1 fastest) for the mirror test.
 *
 * Cells whose three indices are all within the first n/2 + 1 entries of their
 * dimension receive consecutive values 1, 2, 3, ... in storage order; every
 * other cell is set to 0.
 *
 * @param data  destination array with room for n1*n2*n3 values
 * @param n1    extent of the fastest-varying dimension
 * @param n2    extent of the middle dimension
 * @param n3    extent of the slowest-varying dimension
 */
void initMirror(double *data, int n1, int n2, int n3) {
  const int lim1 = n1/2 + 1;
  const int lim2 = n2/2 + 1;
  const int lim3 = n3/2 + 1;

  int next = 1;  // value handed to the next cell inside the corner region
  int pos = 0;   // running linear index, advances in storage order
  for (int z = 0; z < n3; ++z) {
    for (int y = 0; y < n2; ++y) {
      const bool planeInside = z < lim3 && y < lim2;
      for (int x = 0; x < n1; ++x, ++pos)
        data[pos] = (planeInside && x < lim1) ? next++ : 0;
    }
  }
}
|
||||
|
||||
/**
 * Print a divider line made of c dashes, followed by a newline.
 *
 * @param c  number of dashes; only the newline is printed when c <= 0
 */
void printDiv(int c) {
  for (int remaining = c; remaining > 0; --remaining)
    std::cout << "-";
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Print an n1 x n2 x n3 array of doubles (n1 fastest) under a 75-dash
 * divider line.
 *
 * Each n1-row is printed on one line with a tab after every value; each
 * n3-plane is followed by an empty line and one more empty line closes the
 * whole dump.
 *
 * @param data  array holding at least n1*n2*n3 values
 * @param n1    extent of the fastest-varying dimension
 * @param n2    extent of the middle dimension
 * @param n3    extent of the slowest-varying dimension
 */
void printMirror(double *data, int n1, int n2, int n3) {
  // divider: 75 dashes and a newline
  for (int dash = 0; dash < 75; ++dash)
    std::cout << "-";
  std::cout << std::endl;

  int pos = 0;  // running linear index, advances in storage order
  for (int plane = 0; plane < n3; ++plane) {
    for (int row = 0; row < n2; ++row) {
      for (int col = 0; col < n1; ++col)
        std::cout << data[pos++] << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Sum the first datasize entries of data, front to back.
 *
 * @param data      array holding at least datasize values
 * @param datasize  number of entries to add up
 * @return the accumulated sum (0 when datasize <= 0)
 */
double sumData(double *data, int datasize) {
  double total = 0;
  for (const double* it = data; datasize > 0; --datasize, ++it)
    total += *it;
  return total;
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
|
||||
int N1 = 8;
|
||||
int N2 = 8;
|
||||
int N3 = 4;
|
||||
|
||||
int n1 = N1 / 2;
|
||||
int n2 = N2 / 2;
|
||||
int n3 = N3 / 2;
|
||||
|
||||
int sizegreen = (n1 + 1) * (n2 + 1) * (n3 + 1);
|
||||
int sizerho = N1 * N2 * N3;
|
||||
|
||||
double *data_green; //= new double[sizegreen];
|
||||
double *data_rho; //= new double[sizerho];
|
||||
|
||||
double hr_m0 = +4.0264984513873269e-04;
|
||||
double hr_m1 = +4.3305596731911289e-04;
|
||||
double hr_m2 = +8.3154085085560838e-04;
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
|
||||
int stream1, stream2;
|
||||
base.createStream(stream1);
|
||||
base.createStream(stream2);
|
||||
cout << "ID stream1: " << stream1 << endl;
|
||||
cout << "ID stream2: " << stream2 << endl;
|
||||
|
||||
void *mem_green1, *mem_green2, *mem_rho1, *mem_rho2;
|
||||
|
||||
mem_green1 = base.allocateMemory<double>(sizegreen, ierr);
|
||||
mem_green2 = base.allocateMemory<double>(sizegreen, ierr);
|
||||
mem_rho1 = base.allocateMemory<double>(sizerho, ierr);
|
||||
mem_rho2 = base.allocateMemory<double>(sizerho, ierr);
|
||||
|
||||
printDiv(50);
|
||||
|
||||
data_green = new double[sizegreen];
|
||||
data_rho = new double[sizerho];
|
||||
|
||||
base.callGreensIntegral(mem_green1, n1+1, n2+1, n3+1, n1+1, n2+1,
|
||||
hr_m0, hr_m1, hr_m2, stream1);
|
||||
base.readData<double>(mem_green1, data_green, sizegreen);
|
||||
cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
|
||||
cout << scientific << setprecision(16);
|
||||
for (int p = 0; p < 7; p++)
|
||||
cout << data_green[p] << "\t";
|
||||
cout << endl;
|
||||
//printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
|
||||
|
||||
base.callGreensIntegration(mem_rho1, mem_green1, n1 + 1, n2 + 1, n3 + 1, -1);
|
||||
base.readData<double>(mem_rho1, data_rho, sizerho);
|
||||
cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
|
||||
//printMirror(data_rho, N1, N2, N3);
|
||||
|
||||
base.callMirrorRhoField(mem_rho1, n1, n2, n3, -1);
|
||||
base.readData<double>(mem_rho1, data_rho, sizerho);
|
||||
cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
|
||||
//printMirror(data_rho, N1, N2, N3);
|
||||
|
||||
printDiv(50);
|
||||
|
||||
/*
|
||||
base.callGreensIntegral(mem_green2, n1+1, n2+1, n3+1, n1+1, n2+1,
|
||||
1, 1, 1, -2);
|
||||
base.readData<double>(mem_green2, data_green, sizegreen);
|
||||
cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
|
||||
//printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
|
||||
|
||||
base.callGreensIntegration(mem_rho2, mem_green2, n1 + 1, n2 + 1, n3 + 1, -2);
|
||||
base.readData<double>(mem_rho2, data_rho, sizerho);
|
||||
cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
|
||||
//printMirror(data_rho, N1, N2, N3);
|
||||
|
||||
base.callMirrorRhoField(mem_rho2, n1, n2, n3, -2);
|
||||
base.readData<double>(mem_rho2, data_rho, sizerho);
|
||||
cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
|
||||
//printMirror(data_rho, N1, N2, N3);
|
||||
*/
|
||||
printDiv(50);
|
||||
|
||||
base.freeMemory<double>(mem_green1, sizegreen);
|
||||
base.freeMemory<double>(mem_green2, sizegreen);
|
||||
base.freeMemory<double>(mem_rho1, sizerho);
|
||||
base.freeMemory<double>(mem_rho2, sizerho);
|
||||
|
||||
delete [] data_green;
|
||||
delete [] data_rho;
|
||||
|
||||
//test complex multiplication
|
||||
int compsize = 300;
|
||||
complex<double> *data1 = new complex<double>[compsize];
|
||||
complex<double> *data2 = new complex<double>[compsize];
|
||||
for (int i = 0; i < compsize; i++) {
|
||||
data1[i] = complex<double>(i+1, i+2);
|
||||
data2[i] = complex<double>(i+3, i+4);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++)
|
||||
cout << data1[i] << "\t";
|
||||
cout << endl;
|
||||
for (int i = 0; i < 3; i++)
|
||||
cout << data2[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
void *ptr1, *ptr2;
|
||||
ptr1 = base.allocateMemory< complex<double> >(compsize, ierr);
|
||||
ptr2 = base.allocateMemory< complex<double> >(compsize, ierr);
|
||||
|
||||
base.writeData< complex<double> >(ptr1, data1, compsize);
|
||||
base.writeData< complex<double> >(ptr2, data2, compsize);
|
||||
|
||||
base.callMultiplyComplexFields(ptr1, ptr2, compsize);
|
||||
|
||||
base.readData< complex<double> >(ptr1, data1, compsize);
|
||||
|
||||
for (int i = 0; i < 3; i++)
|
||||
cout << data1[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
base.freeMemory< complex<double> >(ptr1, compsize);
|
||||
base.freeMemory< complex<double> >(ptr2, compsize);
|
||||
|
||||
return 0;
|
||||
}
|
191
test/testImageReconstruction.cpp
Normal file
191
test/testImageReconstruction.cpp
Normal file
@ -0,0 +1,191 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <sys/time.h>
|
||||
#include "DKSImageReconstruction.h"
|
||||
|
||||
/** Position of one voxel: three single-precision coordinates. */
struct voxelPosition {
  float x;  // coordinate along the fastest-varying grid index
  float y;  // coordinate along the middle grid index
  float z;  // coordinate along the slowest-varying grid index
};
|
||||
|
||||
/**
 * Fill image[0..size-1] with values drawn from rand(), scaled by RAND_MAX
 * into the range [0, 1].
 *
 * @param image  destination array with room for at least size values
 * @param size   number of entries to write
 */
void initImage(float *image, int size) {
  for (float* px = image; size > 0; --size, ++px)
    *px = static_cast<float>(rand()) / RAND_MAX;
}
|
||||
|
||||
/**
 * Build the coordinates of an N x N x N voxel grid stored with x fastest.
 *
 * Each coordinate starts at 0 on the first voxel along its axis and is
 * obtained by adding 0.1 to the coordinate of the previous voxel along that
 * axis, so values accumulate step by step rather than being computed as
 * index * 0.1.
 *
 * @param voxel  destination array with room for N*N*N entries
 * @param N      number of voxels along each axis
 */
void initPosition(voxelPosition *voxel, int N) {
  const int rowStride = N;       // distance to the previous voxel in y
  const int slabStride = N * N;  // distance to the previous voxel in z

  int idx = 0;  // running linear index, equals i*N*N + j*N + k
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      for (int k = 0; k < N; ++k, ++idx) {
        voxelPosition& cell = voxel[idx];
        cell.x = (k == 0) ? 0.0 : voxel[idx - 1].x + 0.1;
        cell.y = (j == 0) ? 0.0 : voxel[idx - rowStride].y + 0.1;
        cell.z = (i == 0) ? 0.0 : voxel[idx - slabStride].z + 0.1;
      }
    }
  }
}
|
||||
|
||||
/**
 * Print the coordinates of the first size voxels as three tab-separated
 * lines: all x values, then all y values, then all z values.
 *
 * @param voxel  array holding at least size entries
 * @param size   number of voxels to print
 */
void printPosition(voxelPosition *voxel, int size) {
  // one output line per coordinate, selected through a pointer to member
  float voxelPosition::* const axes[3] = {
    &voxelPosition::x, &voxelPosition::y, &voxelPosition::z
  };

  for (int axis = 0; axis < 3; ++axis) {
    for (int i = 0; i < size; ++i)
      std::cout << voxel[i].*axes[axis] << "\t";
    std::cout << std::endl;
  }
}
|
||||
|
||||
#define DIAMETER 2.0
|
||||
/**
 * Test whether voxel id lies inside the sphere of diameter DIAMETER centred
 * on the source position.
 *
 * @param image_tmp    array of voxel positions
 * @param source_temp  centre of the sphere
 * @param id           index of the voxel to test
 * @return true when the Euclidean distance between the voxel and the source
 *         is strictly less than DIAMETER * 0.5
 */
bool select_source(voxelPosition *image_tmp, voxelPosition source_temp, int id)
{
  const voxelPosition& candidate = image_tmp[id];

  float sq_x = pow(candidate.x - source_temp.x, 2);
  float sq_y = pow(candidate.y - source_temp.y, 2);
  float sq_z = pow(candidate.z - source_temp.z, 2);
  float distance = sqrt(sq_x + sq_y + sq_z);

  return distance < DIAMETER*0.5;
}
|
||||
|
||||
void calculate_source(float *image_space , voxelPosition *image_geometry,
|
||||
voxelPosition source, int total_voxels,
|
||||
float *average, float *std)
|
||||
{
|
||||
|
||||
int number_selected_maximum = 10000;
|
||||
float *select;
|
||||
select = new float[number_selected_maximum];
|
||||
for (int j=0;j<number_selected_maximum;j++)
|
||||
select[j] = 0.0;
|
||||
int number_selected=0;
|
||||
|
||||
for (int voxel_id = 0; voxel_id < total_voxels; voxel_id++) {
|
||||
if ( select_source( image_geometry, source, voxel_id ) ) {
|
||||
select[number_selected] = image_space[voxel_id];
|
||||
number_selected += 1;
|
||||
}
|
||||
}
|
||||
|
||||
*average = 0.0;
|
||||
*std = 0.0;
|
||||
|
||||
for (int j=0;j<number_selected;j++)
|
||||
*average += select[j];
|
||||
*average /= float(number_selected);
|
||||
|
||||
for (int j=0;j<number_selected;j++)
|
||||
*std += pow(*average-select[j],2);
|
||||
*std = sqrt(*std/number_selected/(number_selected-1));
|
||||
|
||||
delete[] select;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 8;
|
||||
if (argc == 2)
|
||||
N = atoi(argv[1]);
|
||||
|
||||
double ttotal;
|
||||
struct timeval timeStart, timeEnd;
|
||||
|
||||
int total = N*N*N;
|
||||
float *image = new float[total];
|
||||
voxelPosition *geometry = new voxelPosition[total];
|
||||
|
||||
initImage(image, total);
|
||||
initPosition(geometry, N);
|
||||
|
||||
voxelPosition source;
|
||||
float avg[total], stdev[total];
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i = 0; i < total; i++) {
|
||||
source.x = geometry[i].x;
|
||||
source.y = geometry[i].y;
|
||||
source.z = geometry[i].z;
|
||||
calculate_source(image , geometry, source, total, &avg[i], &stdev[i]);
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
|
||||
|
||||
float avgavg = 0;
|
||||
float avgstdev = 0;
|
||||
for (int i = 0; i < total; i++) {
|
||||
avgavg += avg[i] / total;
|
||||
avgstdev += stdev[i] / total;
|
||||
}
|
||||
|
||||
std::cout << "Total voxels: " << N*N*N << std::endl;
|
||||
std::cout << "Dimensions [" << geometry[0].x << ":" << geometry[N-1].x << "]"
|
||||
<< "[" << geometry[0].y << ":" << geometry[N*N-1].x << "]"
|
||||
<< "[" << geometry[0].z << ":" << geometry[N*N*N-1].x << "]" << std::endl;
|
||||
std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
|
||||
|
||||
|
||||
void *image_space, *image_position, *source_position, *davg, *dstd;
|
||||
|
||||
int ierr;
|
||||
DKSImageRecon base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
image_space = base.allocateMemory<float>(total, ierr);
|
||||
image_position = base.allocateMemory<voxelPosition>(total, ierr);
|
||||
source_position = base.allocateMemory<voxelPosition>(total, ierr);
|
||||
davg = base.allocateMemory<float>(total, ierr);
|
||||
dstd = base.allocateMemory<float>(total, ierr);
|
||||
|
||||
base.writeData<float>(image_space, image, total);
|
||||
base.writeData<voxelPosition>(image_position, geometry, total);
|
||||
base.writeData<voxelPosition>(source_position, geometry, total);
|
||||
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
base.callCalculateSource(image_space, image_position, source_position,
|
||||
davg, dstd, DIAMETER, total, total);
|
||||
|
||||
|
||||
base.readData<float>(davg, avg, total);
|
||||
base.readData<float>(dstd, stdev, total);
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
|
||||
|
||||
base.freeMemory<float>(image_space, total);
|
||||
base.freeMemory<voxelPosition>(image_position, total);
|
||||
base.freeMemory<voxelPosition>(source_position, total);
|
||||
base.freeMemory<float>(dstd, total);
|
||||
base.freeMemory<float>(davg, total);
|
||||
|
||||
avgavg = 0;
|
||||
avgstdev = 0;
|
||||
for (int i = 0; i < total; i++) {
|
||||
avgavg += avg[i] / total;
|
||||
avgstdev += stdev[i] / total;
|
||||
}
|
||||
std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
|
||||
|
||||
return N;
|
||||
|
||||
}
|
51
test/testMIC.cpp
Normal file
51
test/testMIC.cpp
Normal file
@ -0,0 +1,51 @@
|
||||
#include <iostream>
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Smoke test for the OpenMP backend: allocate two device buffers, write host
 * data into them, read it back and print what was read.
 *
 * @return 0
 */
int main() {

  DKSBase base;

  base.setAPI("OpenMP", 6);
  base.initDevice();

  //init data
  int ierr;
  int N = 8;
  double *in_data = new double[N];
  double *in_data2 = new double[N];
  double *out_data = new double[N];
  double *out_data2 = new double[N];

  for (int i = 0; i < N; i++) {
    in_data[i] = i;
    in_data2[i] = i*i;
  }

  //test memory allocation, write and read operations
  void *d_ptr, *d2_ptr;

  d_ptr = base.allocateMemory<double>(N, ierr);
  d2_ptr = base.allocateMemory<double>(N, ierr);

  base.writeData<double>(d_ptr, in_data, N);
  base.writeData<double>(d2_ptr, in_data2, N);

  base.readData<double>(d_ptr, out_data, N);
  base.readData<double>(d2_ptr, out_data2, N);
  base.freeMemory<double>(d_ptr, N);
  base.freeMemory<double>(d2_ptr, N);

  //print results
  for (int i = 0; i < N; i++)
    std::cout << out_data[i] << "\t";
  std::cout << std::endl;

  for (int i = 0; i < N; i++)
    std::cout << out_data2[i] << "\t";
  std::cout << std::endl;

  //release the host buffers (previously leaked)
  delete[] out_data2;
  delete[] out_data;
  delete[] in_data2;
  delete[] in_data;

  return 0;

}
|
94
test/testMICOpenCL.cpp
Normal file
94
test/testMICOpenCL.cpp
Normal file
@ -0,0 +1,94 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include "DKSBase.h"
|
||||
#include "Utility/TimeStamp.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[4];
|
||||
|
||||
if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else if (argc == 2){
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << endl;
|
||||
cout << "Use device: " << device_name << endl;
|
||||
|
||||
|
||||
int ierr;
|
||||
int N = 10000;
|
||||
double *data = new double[N];
|
||||
double *data_out = new double[N];
|
||||
double *data_out2 = new double[N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
data[i] = i;
|
||||
}
|
||||
|
||||
//init dks base class, set API to opencl and init connection with OpenCL device
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
|
||||
//data ptr
|
||||
void *data_ptr, *data_ptr2;
|
||||
|
||||
//allocate memory
|
||||
data_ptr = base.allocateMemory<double>(N, ierr);
|
||||
data_ptr2 = base.allocateMemory<double>(N, ierr);
|
||||
|
||||
//write data to memory and fill data on device
|
||||
base.writeData<double>(data_ptr, data, N);
|
||||
base.writeData<double>(data_ptr2, data, N);
|
||||
//base.callNt<double>(data_ptr2, data_ptr, 6, N, 1, 0);
|
||||
|
||||
//calc sum
|
||||
base.callSum<double>(data_ptr2, data_ptr2, N);
|
||||
|
||||
//base.callSum<double>(data_ptr, data_ptr, N);
|
||||
|
||||
//chi^2
|
||||
//base.callChi2<double>(data_ptr, data_ptr, data_ptr, N);
|
||||
//base.callChi2<double>(data_ptr2, data_ptr2, data_ptr2, N);
|
||||
|
||||
//read data
|
||||
base.readData<double>(data_ptr, data_out, N);
|
||||
base.readData<double>(data_ptr2, data_out2, N);
|
||||
|
||||
//base.oclEventInfo();
|
||||
|
||||
//free memory
|
||||
base.freeMemory<double>(data_ptr, N);
|
||||
base.freeMemory<double>(data_ptr2, N);
|
||||
|
||||
|
||||
/*
|
||||
for (int i = 0; i < N; i++) {
|
||||
cout << data[i] << "\t";
|
||||
}
|
||||
cout << endl << endl;
|
||||
for (int i = 0; i < N; i++) {
|
||||
cout << data_out[i] << "\t";
|
||||
}
|
||||
cout << endl << endl;
|
||||
for (int i = 0; i < N; i++) {
|
||||
cout << data_out2[i] << "\t";
|
||||
}
|
||||
cout << endl;
|
||||
*/
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
68
test/testMICPush.cpp
Normal file
68
test/testMICPush.cpp
Normal file
@ -0,0 +1,68 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/** Three double-precision components of one particle record. */
struct Part {
  double x;
  double y;
  double z;
};

/**
 * Fill the first N records with pseudo-random components in [0, 1].
 *
 * Components are drawn from rand() in the order x, y, z for each record.
 * The division is carried out in floating point; dividing the two integers
 * directly truncated almost every value to 0.
 *
 * @param data  destination array with room for at least N records
 * @param N     number of records to initialise
 */
void initData(Part *data, int N) {
  for (int i = 0; i < N; i++) {
    data[i].x = static_cast<double>(rand()) / RAND_MAX;
    data[i].y = static_cast<double>(rand()) / RAND_MAX;
    data[i].z = static_cast<double>(rand()) / RAND_MAX;
  }
}
|
||||
|
||||
/**
 * Driver for callParallelTTrackerPush on the OpenMP "-mic" device.
 *
 * Allocates device buffers for two Part arrays and one double array, then
 * five times in a row writes R to the device, runs the push and issues an
 * asynchronous read of R back to the host.
 *
 * @return 0
 */
int main() {

  int ierr;
  int N = 100000;
  const int passes = 5;

  //__declspec(align(64)) Part *R = new Part[N];
  //__declspec(align(64)) Part *P = new Part[N];
  Part *R = new Part[N];
  Part *P = new Part[N];

  // R is filled before P so both consume rand() in a fixed order
  initData(R, N);
  initData(P, N);

  DKSBase dksbase;
  dksbase.setAPI("OpenMP", 6);
  dksbase.setDevice("-mic", 4);
  dksbase.initDevice();

  void *r_ptr = dksbase.allocateMemory<Part>(N, ierr);
  void *p_ptr = dksbase.allocateMemory<Part>(N, ierr);
  void *dt_ptr = dksbase.allocateMemory<double>(N, ierr);

  dksbase.writeData<Part>(r_ptr, R, N);

  std::cout << "====================START PUSH====================" << std::endl;

  int pass = 0;
  while (pass < passes) {
    //write r to device
    dksbase.writeData<Part>(r_ptr, R, N);
    //calc push
    dksbase.callParallelTTrackerPush (r_ptr, p_ptr, N, dt_ptr,
                                      0.001, 1, false, NULL);
    //read R from device
    dksbase.readDataAsync<Part> (r_ptr, R, N, NULL);
    ++pass;
  }

  std::cout << "====================END PUSH====================" << std::endl;

  dksbase.freeMemory<Part>(r_ptr, N);
  dksbase.freeMemory<Part>(p_ptr, N);
  dksbase.freeMemory<double>(dt_ptr, N);

  return 0;
}
|
89
test/testMPI.cpp
Normal file
89
test/testMPI.cpp
Normal file
@ -0,0 +1,89 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Print nprocs rows of N integers each, one row per line.
 *
 * Row r starts at data[r*N]; every value is followed by a tab.
 *
 * @param data    array holding at least nprocs*N values
 * @param N       number of values per row
 * @param nprocs  number of rows
 * @param message text written verbatim before the values; skipped when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
  if (message[0] != '\0')
    std::cout << message;

  for (int row = 0; row < nprocs; ++row) {
    const int rowStart = row * N;
    int col = 0;
    while (col < N) {
      std::cout << data[rowStart + col] << "\t";
      ++col;
    }
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Fill the first N entries of data with the value rank + 1.
 *
 * @param data  destination array with room for at least N values
 * @param N     number of entries to write (nothing is written when N <= 0)
 * @param rank  rank id; the stored value is rank + 1
 */
void initData(int *data, int N, int rank) {
  const int value = rank + 1;
  for (int* slot = data; N > 0; --N, ++slot)
    *slot = value;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
cout << "Rank " << (rank+1) << " from " << nprocs << endl;
|
||||
|
||||
int n = 8;
|
||||
int sizen = sizeof(int)*n;
|
||||
int sizeall = sizeof(int)*n*nprocs;
|
||||
|
||||
int *hdata_in = new int[n];
|
||||
int *hdata_out = new int[n];
|
||||
initData(hdata_in, n, rank);
|
||||
cout << "In data for process " << rank+1 << ":\t";
|
||||
printData(hdata_in, n, 1);
|
||||
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
|
||||
if (rank == 0) {
|
||||
|
||||
int *hdata_out_all = new int[nprocs*n];
|
||||
void* mem_ptr;
|
||||
mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
|
||||
|
||||
MPI_Gather(hdata_in, n, MPI_INT, mem_ptr, n, MPI_INT, 0, MPI_COMM_WORLD);
|
||||
|
||||
base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
|
||||
|
||||
MPI_Scatter(mem_ptr, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
|
||||
|
||||
base.freeMemory<int>(mem_ptr, n*nprocs);
|
||||
|
||||
printData(hdata_out_all, n, nprocs, "Out data 1:\n");
|
||||
cout << "Scatter data for proces: " << rank + 1 << ": \t";
|
||||
printData(hdata_in, n, 1);
|
||||
} else {
|
||||
|
||||
MPI_Gather(hdata_in, n, MPI_INT, NULL, NULL, NULL, 0, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Scatter(NULL, NULL, NULL, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
|
||||
|
||||
cout << "Scatter data for proces: " << rank + 1 << ": \t";
|
||||
printData(hdata_in, n, 1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
91
test/testMPIFFT.cpp
Normal file
91
test/testMPIFFT.cpp
Normal file
@ -0,0 +1,91 @@
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Print a table of complex values to stdout, one row per process.
 *
 * @param data    row-major array holding nprocs rows of N values each
 * @param N       number of values per row
 * @param nprocs  number of rows to print
 * @param message optional text printed verbatim before the table;
 *                nothing is printed when it is the empty string
 */
void printData(std::complex<double> *data, int N, int nprocs, const char *message = "") {
  // An empty C string starts with the terminator.
  if (message[0] != '\0')
    std::cout << message;

  for (int row = 0; row < nprocs; ++row) {
    const int first = row * N;
    for (int col = 0; col < N; ++col)
      std::cout << data[first + col] << "\t";
    std::cout << std::endl;
  }
}
|
||||
|
||||
/**
 * Fill the first N entries of data with the purely real value rank + 1.
 *
 * @param data destination array with room for at least N complex values
 * @param N    number of entries to write
 * @param rank 0-based process rank
 */
void initData(std::complex<double> *data, int N, int rank) {
  const std::complex<double> value((double)rank + 1.0, 0.0);
  int idx = 0;
  while (idx < N) {
    data[idx] = value;
    ++idx;
  }
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
cout << "Rank " << (rank+1) << " from " << nprocs << endl;
|
||||
|
||||
int n = 8;
|
||||
|
||||
complex<double> *hdata_in = new complex<double>[n];
|
||||
complex<double> *hdata_out = new complex<double>[n];
|
||||
initData(hdata_in, n, rank);
|
||||
cout << "In data for process " << rank+1 << ":\t";
|
||||
printData(hdata_in, n, 1);
|
||||
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
|
||||
if (rank == 0) {
|
||||
|
||||
complex<double> *hdata_out_all = new complex<double>[nprocs*n];
|
||||
void* mem_ptr;
|
||||
mem_ptr = base.allocateMemory< complex<double> >(nprocs*n, ierr);
|
||||
|
||||
|
||||
MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, mem_ptr, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
|
||||
|
||||
|
||||
int dimsize[3] = {n*nprocs, 1, 1};
|
||||
base.callFFT(mem_ptr, 1, dimsize);
|
||||
base.readData< complex<double> >(mem_ptr, hdata_out_all, n*nprocs);
|
||||
|
||||
MPI_Scatter(mem_ptr, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
|
||||
|
||||
base.freeMemory< complex<double> >(mem_ptr, n*nprocs);
|
||||
|
||||
printData(hdata_out_all, n, nprocs, "Out data 1:\n");
|
||||
cout << "Scatter data for proces: " << rank + 1 << ": \t";
|
||||
printData(hdata_out, n, 1);
|
||||
} else {
|
||||
|
||||
MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, NULL, NULL, NULL, 0, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Scatter(NULL, NULL, NULL, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
|
||||
|
||||
cout << "Scatter data for proces: " << rank + 1 << ": \t";
|
||||
printData(hdata_out, n, 1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
75
test/testMemObjects.cpp
Normal file
75
test/testMemObjects.cpp
Normal file
@ -0,0 +1,75 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Allocate, fill and release several device buffers through DKSBase.
 *
 * Usage: testMemObjects [n]
 * The optional argument n (default 10) selects the buffer length
 * N = 2 << n doubles. Eight buffers of that size are allocated one after
 * another, each is written with the same host data, and then all are freed.
 */
int main(int argc, char *argv[]) {

  int ierr, n, N;

  if (argc > 1)
    n = atoi(argv[1]);
  else
    n = 10;

  // 2 << n must stay inside the range of int; shifting by a negative or
  // too large amount is undefined behaviour.
  if (n < 0 || n > 29) {
    cout << "Exponent must be in [0, 29], got " << n << endl;
    return 1;
  }

  // NOTE(review): 2 << n is 2^(n+1) elements; other tests use 2^n - confirm
  // which one is intended.
  N = 2 << n;
  cout << "Elements: " << N << endl;

  double *data = new double[N];
  for (int i = 0; i < N; i++)
    data[i] = (double)i / N;

  DKSBase base = DKSBase();
  base.setAPI("OpenCL", 6);
  base.setDevice("-gpu", 4);
  base.initDevice();

  // Allocate and fill the buffers one by one, keeping all of them alive
  // until every allocation has been made.
  const int numBuffers = 8;
  void *ptrs[numBuffers];
  for (int b = 0; b < numBuffers; b++) {
    // NOTE(review): ierr is overwritten on every call and never inspected.
    ptrs[b] = base.allocateMemory<double>(N, ierr);
    ierr = base.writeData<double>(ptrs[b], data, N);
  }

  for (int b = 0; b < numBuffers; b++)
    base.freeMemory<double>(ptrs[b], N);

  delete[] data;

  return 0;
}
|
||||
|
73
test/testOffset.cpp
Normal file
73
test/testOffset.cpp
Normal file
@ -0,0 +1,73 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
|
||||
int ierr,n, N;
|
||||
|
||||
N = 8;
|
||||
n = 4;
|
||||
|
||||
double *data_in = new double[N];
|
||||
double *data_out_1 = new double[N];
|
||||
double *data_out_2 = new double[N];
|
||||
for (int i = 0; i < N; i++) {
|
||||
data_in[i] = (double)i / N;
|
||||
data_out_1[i] = 0.0;
|
||||
data_out_2[i] = 0.0;
|
||||
}
|
||||
|
||||
cout << "Run example on: " << api_name << " using " << device_name << endl;
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
void *ptr1;
|
||||
ptr1 = base.allocateMemory<double>(N, ierr);
|
||||
|
||||
ierr = base.writeData<double>(ptr1, data_in, n, 0);
|
||||
ierr = base.writeData<double>(ptr1, data_in, n, 4);
|
||||
|
||||
ierr = base.readData<double>(ptr1, data_out_1, N);
|
||||
ierr = base.readData<double>(ptr1, data_out_2, n, 2);
|
||||
|
||||
base.freeMemory<double>(ptr1, N);
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << data_in[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << data_out_1[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << data_out_2[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
81
test/testOffsetMPI.cpp
Normal file
81
test/testOffsetMPI.cpp
Normal file
@ -0,0 +1,81 @@
|
||||
#include <mpi.h>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int rank, size;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
|
||||
cout << "Rank " << rank << " from " << size << endl;
|
||||
|
||||
|
||||
int ierr, N, n;
|
||||
|
||||
N = 8;
|
||||
n = N / 2;
|
||||
|
||||
double *data_in = new double[n];
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
data_in[i] = (double)rank + 1.0 + (double)i / n;
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
if (rank == 0) {
|
||||
//alocate memory of size N
|
||||
void *ptr1;
|
||||
ptr1 = base.allocateMemory<double>(size*N, ierr);
|
||||
cout << "Sent pointer: " << ptr1 << endl;
|
||||
|
||||
//send ptr to other processes
|
||||
MPI_Send(&ptr1, sizeof(void*), MPI_BYTE, 1, 123, MPI_COMM_WORLD);
|
||||
|
||||
//wrtie n data with no offset to device and wait for other processes
|
||||
ierr = base.writeData<double>(ptr1, data_in, n, rank*n);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
//read memory of size N from device
|
||||
double *data_out = new double[N];
|
||||
ierr = base.readData<double>(ptr1, data_out, N);
|
||||
|
||||
//free device memory
|
||||
base.freeMemory<double>(ptr1, size*N);
|
||||
|
||||
//print results
|
||||
for (int i = 0; i < n; i++)
|
||||
cout << data_in[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << data_out[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
} else {
|
||||
//receive device memory pointer
|
||||
void *ptr2;
|
||||
MPI_Recv(&ptr2, sizeof(void*), MPI_BYTE, 0, 123, MPI_COMM_WORLD, NULL);
|
||||
cout << "Received pointer: " << ptr2 << endl;
|
||||
//write data with an offset
|
||||
base.writeData<double>(ptr2, data_in, n, rank*n);
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
57
test/testPush.cpp
Normal file
57
test/testPush.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include <vector_types.h>
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
/**
 * Fill N three-component vectors with pseudo-random values in [0, 1].
 *
 * @param data destination array with room for at least N elements
 * @param N    number of elements to initialise
 */
void initData(double3 *data, int N) {
  for (int i = 0; i < N; i++) {
    // Convert before dividing: rand() / RAND_MAX on ints truncates to 0
    // for every result except RAND_MAX itself.
    data[i].x = (double)rand() / RAND_MAX;
    data[i].y = (double)rand() / RAND_MAX;
    data[i].z = (double)rand() / RAND_MAX;
  }
}
|
||||
|
||||
|
||||
int main() {
|
||||
|
||||
int ierr;
|
||||
int N = 1000000;
|
||||
double3 *R = new double3[N];
|
||||
double3 *P = new double3[N];
|
||||
|
||||
initData(R, N);
|
||||
initData(P, N);
|
||||
|
||||
DKSBase dksbase;
|
||||
dksbase.setAPI("Cuda", 4);
|
||||
dksbase.setDevice("-gpu", 4);
|
||||
dksbase.initDevice();
|
||||
|
||||
void *r_ptr, *p_ptr;
|
||||
|
||||
r_ptr = dksbase.allocateMemory<double3>(N, ierr);
|
||||
p_ptr = dksbase.allocateMemory<double3>(N, ierr);
|
||||
|
||||
dksbase.writeData<double3>(r_ptr, R, N);
|
||||
dksbase.writeData<double3>(p_ptr, P, N);
|
||||
|
||||
for (int i = 0; i < 100; i++)
|
||||
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, N, NULL, 0.5, 1, false);
|
||||
|
||||
|
||||
dksbase.readData<double3>(r_ptr, R, N);
|
||||
dksbase.readData<double3>(p_ptr, P, N);
|
||||
|
||||
dksbase.freeMemory<double3>(r_ptr, N);
|
||||
dksbase.freeMemory<double3>(p_ptr, N);
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
168
test/testRCFFT.cpp
Normal file
168
test/testRCFFT.cpp
Normal file
@ -0,0 +1,168 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(double* &data, int N1, int N2);
|
||||
void printData(complex<double>* &data, int N1, int N2);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
void printData3DN4(double* &data, int N, int dim);
|
||||
|
||||
|
||||
void compareData(double* &data1, double* &data2, int N, int dim);
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N1 = 4;
|
||||
int N2 = 4;
|
||||
|
||||
if (argc == 3) {
|
||||
N1 = atoi(argv[1]);
|
||||
N2 = atoi(argv[2]);
|
||||
}
|
||||
|
||||
int dimsize[3] = {N1, N2, 1};
|
||||
|
||||
cout << "Begin RC 3D FFT tests, grid = " << N1 << "\t" << N2 << endl;
|
||||
int sizereal = N1*N2;
|
||||
int sizecomp = N1*(N2/2+1);
|
||||
|
||||
int dim = 3;
|
||||
double *cdata = new double[sizereal];
|
||||
complex<double> *cfft = new complex<double>[sizecomp];
|
||||
|
||||
for (int i = 0; i < N2; i++) {
|
||||
for (int j = 0; j < N1; j++) {
|
||||
cdata[i*N1 + j] = (double)(j) / N1;
|
||||
}
|
||||
}
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
void *real_ptr, *comp_ptr;
|
||||
int ierr;
|
||||
/* allocate memory on device */
|
||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData<double>(real_ptr, cdata, sizereal);
|
||||
|
||||
/* execute fft */
|
||||
base.callR2CFFT(real_ptr, comp_ptr, 2, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory<double>(real_ptr, sizereal);
|
||||
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
||||
|
||||
cout << "FFT complete" << endl;
|
||||
|
||||
|
||||
/* print results */
|
||||
printData(cdata, N1, N2);
|
||||
printData(cfft, N1, N2);
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Print a real N2 x N1 array: N2 rows of N1 space-separated values,
 * followed by one blank line.
 *
 * @param data row-major array with N1*N2 values
 * @param N1   values per row
 * @param N2   number of rows
 */
void printData(double* &data, int N1, int N2) {
  // Elements are visited in storage order, so a running index is enough.
  int idx = 0;
  for (int row = 0; row < N2; ++row) {
    for (int col = 0; col < N1; ++col, ++idx)
      std::cout << data[idx] << " ";
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Print the complex result of a real-to-complex transform.
 *
 * N2/2 + 1 rows of N1 values are printed. Real and imaginary parts whose
 * magnitude is below 0.00001 are shown as exactly 0. A blank line follows
 * the table.
 *
 * @param data row-major array with N1 * (N2/2 + 1) values
 * @param N1   values per row
 * @param N2   size of the halved dimension before the transform
 */
void printData(std::complex<double>* &data, int N1, int N2) {
  const int rows = N2/2 + 1;
  for (int row = 0; row < rows; ++row) {
    for (int col = 0; col < N1; ++col) {
      double re = data[row*N1 + col].real();
      double im = data[row*N1 + col].imag();

      // Flush values that are numerically zero to keep the output readable.
      if (re < 0.00001 && re > -0.00001) re = 0.0;
      if (im < 0.00001 && im > -0.00001) im = 0.0;

      std::cout << std::complex<double>(re, im) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Print an N x N x N complex array as "real; imag" pairs.
 *
 * One output line is produced per value of the middle index; the line holds
 * the values for every combination of the slowest and fastest index. Parts
 * whose magnitude is below 10e-5 are printed as 0. A blank line follows.
 *
 * @param data array indexed as data[i*N*N + j*N + k]
 * @param N    extent of each dimension
 * @param dim  not used
 */
void printData3DN4(std::complex<double>* &data, int N, int dim) {
  const int plane = N*N;
  for (int mid = 0; mid < N; ++mid) {
    for (int slow = 0; slow < N; ++slow) {
      const int base = slow*plane + mid*N;
      for (int fast = 0; fast < N; ++fast) {
        double re = data[base + fast].real();
        double im = data[base + fast].imag();

        if (re < 10e-5 && re > -10e-5) re = 0;
        if (im < 10e-5 && im > -10e-5) im = 0;

        std::cout << re << "; " << im << "\t";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Print an N x N x N real array.
 *
 * One output line is produced per value of the middle index; the line holds
 * the values for every combination of the slowest and fastest index. Values
 * whose magnitude does not exceed 10e-5 are printed as 0. A blank line
 * follows.
 *
 * @param data array indexed as data[i*N*N + j*N + k]
 * @param N    extent of each dimension
 * @param dim  not used
 */
void printData3DN4(double* &data, int N, int dim) {
  const int plane = N*N;
  for (int mid = 0; mid < N; ++mid) {
    for (int slow = 0; slow < N; ++slow) {
      const int base = slow*plane + mid*N;
      for (int fast = 0; fast < N; ++fast) {
        const double value = data[base + fast];
        const bool significant = (value > 10e-5 || value < -10e-5);
        if (!significant) {
          std::cout << 0 << "\t";
          continue;
        }
        std::cout << value << "\t";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Print the summed absolute difference between two real arrays.
 *
 * The arrays are treated as N^dim values for dim 1, 2 or 3 (a dim below 2
 * behaves like 1, above 3 like 3) and compared element by element in
 * storage order.
 *
 * @param data1 first array
 * @param data2 second array
 * @param N     extent of each dimension
 * @param dim   number of dimensions
 */
void compareData(double* &data1, double* &data2, int N, int dim) {
  // Number of elements covered; nothing is compared for a non-positive N.
  int count = 0;
  if (N > 0) {
    count = N;
    if (dim > 1) count *= N;
    if (dim > 2) count *= N;
  }

  double sum = 0;
  int id = 0;
  while (id < count) {
    sum += fabs(data1[id] - data2[id]);
    ++id;
  }
  std::cout << "Size " << N << " CC <--> CC diff: " << sum << std::endl;
}
|
||||
|
181
test/testStockFFT3D.cpp
Normal file
181
test/testStockFFT3D.cpp
Normal file
@ -0,0 +1,181 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int n = 2;
|
||||
if (argc == 2)
|
||||
n = atoi(argv[1]);
|
||||
|
||||
int N = pow(2,n);
|
||||
|
||||
cout << "Begin DKS Base tests" << endl;
|
||||
|
||||
cout << "FFT size: " << N << endl;
|
||||
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cfft2 = new complex<double>[N*N*N];
|
||||
complex<double> *cfft3 = new complex<double>[N*N*N];
|
||||
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
//cdata[i*N*N + j*N + k] = complex<double>((double)k/(N*N*N), 0);
|
||||
cdata[i*N*N + j*N + k] = complex<double>(k, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cfft2[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cfft3[i*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (N == 4)
|
||||
printData3DN4(cdata, N, 3);
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
int ierr;
|
||||
|
||||
|
||||
timestamp_t t0, t1;
|
||||
|
||||
/* stockham radix-2 out-of-place fft */
|
||||
DKSBase base2;
|
||||
base2.setAPI("OpenCL", 6);
|
||||
base2.setDevice("-gpu", 4);
|
||||
base2.initDevice();
|
||||
|
||||
cout << endl;
|
||||
void *src_ptr;
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t0 = get_timestamp();
|
||||
src_ptr = base2.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
base2.writeData< complex<double> >(src_ptr, cdata, N*N*N);
|
||||
base2.callFFTStockham(src_ptr, 3, dimsize);
|
||||
base2.readData< complex<double> >(src_ptr, cfft2, N*N*N);
|
||||
base2.freeMemory< complex<double> >(src_ptr, N*N*N);
|
||||
t1 = get_timestamp();
|
||||
cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
|
||||
}
|
||||
|
||||
if (N == 4)
|
||||
printData3DN4(cfft2, N, 3);
|
||||
|
||||
//delete base2;
|
||||
cout << endl;
|
||||
|
||||
/* CUDA cufft */
|
||||
DKSBase base3;
|
||||
base3.setAPI("Cuda", 4);
|
||||
base3.setDevice("-gpu", 4);
|
||||
base3.initDevice();
|
||||
|
||||
cout << endl;
|
||||
void *cuda_ptr;
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t0 = get_timestamp();
|
||||
cuda_ptr = base3.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
base3.writeData< complex<double> >(cuda_ptr, cdata, N*N*N);
|
||||
base3.callFFT(cuda_ptr, 3, dimsize);
|
||||
base3.readData< complex<double> >(cuda_ptr, cfft3, N*N*N);
|
||||
base3.freeMemory< complex<double> >(cuda_ptr, N*N*N);
|
||||
t1 = get_timestamp();
|
||||
cout << "Cuda FFT time: " << get_secs(t0, t1) << endl;
|
||||
}
|
||||
|
||||
if (N == 4)
|
||||
printData3DN4(cfft3, N, 3);
|
||||
|
||||
//delete base3;
|
||||
cout << endl;
|
||||
|
||||
|
||||
/* radix-2 in place fft */
|
||||
DKSBase base;
|
||||
base.setAPI("OpenCL", 6);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
cout << endl;
|
||||
void *mem_ptr;
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t0 = get_timestamp();
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
base.readData< complex<double> >(mem_ptr, cfft, N*N*N);
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
t1 = get_timestamp();
|
||||
cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
|
||||
}
|
||||
|
||||
if (N == 4)
|
||||
printData3DN4(cfft, N, 3);
|
||||
|
||||
//delete base;
|
||||
cout << endl;
|
||||
|
||||
/* compare results */
|
||||
cout << endl;
|
||||
|
||||
cout << "Radix 2 vs Stockham: ";
|
||||
compareData(cfft, cfft2, N, 3);
|
||||
|
||||
cout << "Radix 2 vs Cufft: ";
|
||||
compareData(cfft, cfft3, N, 3);
|
||||
|
||||
cout << "Stockham vs Cufft: ";
|
||||
compareData(cfft2, cfft3, N, 3);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Print the real parts of an N x N x N complex array.
 *
 * One output line is produced per value of the middle index; the line holds
 * the values for every combination of the slowest and fastest index. Real
 * parts whose magnitude does not exceed 10e-5 are printed as 0. A blank line
 * follows.
 *
 * @param data array indexed as data[i*N*N + j*N + k]
 * @param N    extent of each dimension
 * @param dim  not used
 */
void printData3DN4(std::complex<double>* &data, int N, int dim) {
  const int plane = N*N;
  for (int mid = 0; mid < N; ++mid) {
    for (int slow = 0; slow < N; ++slow) {
      const int base = slow*plane + mid*N;
      for (int fast = 0; fast < N; ++fast) {
        const double re = data[base + fast].real();
        const bool significant = (re > 10e-5 || re < -10e-5);
        if (!significant) {
          std::cout << 0 << "\t";
          continue;
        }
        std::cout << re << "\t";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
|
||||
|
||||
/**
 * Print the summed absolute difference between two complex arrays.
 *
 * Real and imaginary parts contribute separately. The arrays are treated as
 * N^dim values for dim 1, 2 or 3 (a dim below 2 behaves like 1, above 3 like
 * 3) and compared element by element in storage order.
 *
 * @param data1 first array
 * @param data2 second array
 * @param N     extent of each dimension
 * @param dim   number of dimensions
 */
void compareData(std::complex<double>* &data1, std::complex<double>* &data2, int N, int dim) {
  // Number of elements covered; nothing is compared for a non-positive N.
  int count = 0;
  if (N > 0) {
    count = N;
    if (dim > 1) count *= N;
    if (dim > 2) count *= N;
  }

  double sum = 0;
  int id = 0;
  while (id < count) {
    sum += fabs(data1[id].real() - data2[id].real());
    sum += fabs(data1[id].imag() - data2[id].imag());
    ++id;
  }
  std::cout << "CC <--> CC diff: " << sum << std::endl;
}
|
107
test/testStockhamFFT.cpp
Normal file
107
test/testStockhamFFT.cpp
Normal file
@ -0,0 +1,107 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int n = 2;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else if (argc == 4) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
n = atoi(argv[3]);
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
int N = pow(2,n);
|
||||
cout << "Use api: " << api_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests" << endl;
|
||||
|
||||
cout << "FFT size: " << N << endl;
|
||||
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
complex<double> *cdata = new complex<double>[N];
|
||||
complex<double> *cfft = new complex<double>[N];
|
||||
complex<double> *cfft2 = new complex<double>[N];
|
||||
complex<double> *cfftsrc = new complex<double>[N];
|
||||
for (int i = 0; i < N; i++) {
|
||||
cdata[i] = complex<double>((double)i / N, 0);
|
||||
cfft[i] = complex<double>(0, 0);
|
||||
cfft2[i] = complex<double>(0, 0);
|
||||
cfftsrc[i] = complex<double>(0, 0);
|
||||
}
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
|
||||
timestamp_t t0, t1;
|
||||
|
||||
/* radix-2 in place fft */
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t0 = get_timestamp();
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N, ierr);
|
||||
base.writeData< complex<double> >(mem_ptr, cdata, N);
|
||||
base.callFFT(mem_ptr, 1, dimsize);
|
||||
base.readData< complex<double> >(mem_ptr, cfft, N);
|
||||
base.freeMemory< complex<double> >(mem_ptr, N);
|
||||
t1 = get_timestamp();
|
||||
cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
|
||||
}
|
||||
|
||||
cout << endl;
|
||||
|
||||
/* stockham radix-2 out-of-place fft */
|
||||
void *src_ptr;
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t0 = get_timestamp();
|
||||
src_ptr = base.allocateMemory< complex<double> >(N, ierr);
|
||||
base.writeData< complex<double> >(src_ptr, cdata, N);
|
||||
base.callFFTStockham(src_ptr, 1, dimsize);
|
||||
base.readData< complex<double> >(src_ptr, cfft2, N);
|
||||
base.freeMemory< complex<double> >(src_ptr, N);
|
||||
t1 = get_timestamp();
|
||||
cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
|
||||
}
|
||||
|
||||
double diff = 0;
|
||||
for (int i = 0; i < N; i++) {
|
||||
diff += fabs(cfft[i].real() - cfft2[i].real());
|
||||
diff += fabs(cfft[i].imag() - cfft2[i].imag());
|
||||
}
|
||||
|
||||
cout << endl << "Difference: " << diff << endl;
|
||||
|
||||
if (diff > 0.00001) {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
cout << cfft[i] << "\t" << cfft2[i] << endl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
227
test/testTimeIntegration.cpp
Normal file
227
test/testTimeIntegration.cpp
Normal file
@ -0,0 +1,227 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include <vector_types.h>
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/** Plain aggregate of three double-precision components x, y and z. */
struct Vector {
  double x;
  double y;
  double z;
};
|
||||
|
||||
/**
 * Build a Vector whose three components are all 0.5.
 *
 * @return the initialised vector, by value
 */
Vector initVector() {
  // Aggregate initialisation in member order x, y, z.
  Vector half = {0.5, 0.5, 0.5};
  return half;
}
|
||||
|
||||
/**
 * Set the first N elements of v to the value returned by initVector().
 *
 * @param v destination array with room for at least N elements
 * @param N number of elements to initialise
 */
void initVectors(Vector *v, int N) {
  int idx = 0;
  while (idx < N) {
    v[idx] = initVector();
    ++idx;
  }
}
|
||||
|
||||
/**
 * Set the first N entries of data to 0.005.
 *
 * @param data destination array with room for at least N doubles
 * @param N    number of entries to write
 */
void initDouble(double *data, int N) {
  int idx = 0;
  while (idx < N) {
    data[idx] = 0.005;
    ++idx;
  }
}
|
||||
|
||||
/**
 * Set the first N entries of data to -1.
 *
 * @param data destination array with room for at least N longs
 * @param N    number of entries to write
 */
void initLastSect(long *data, int N) {
  int idx = 0;
  while (idx < N) {
    data[idx] = -1;
    ++idx;
  }
}
|
||||
|
||||
/**
 * Print the sum of all components of the first N vectors as "checksum: <sum>".
 *
 * @param v array of at least N vectors
 * @param N number of vectors to include
 */
void checkSum(Vector *v, int N) {
  double total = 0;
  for (int idx = 0; idx < N; ++idx) {
    // Add the three components first, then fold them into the total.
    const double components = v[idx].x + v[idx].y + v[idx].z;
    total += components;
  }

  std::cout << "checksum: " << total << std::endl;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int loop = 10;
|
||||
int numpart = 10;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenMP");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-npart")) {
|
||||
numpart = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (argv[i] == string("-loop")) {
|
||||
loop = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Number of particles: " << numpart << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
|
||||
//init p,r and dt arrays to test time integration
|
||||
Vector *r = new Vector[numpart];
|
||||
Vector *p = new Vector[numpart];
|
||||
Vector *x = new Vector[numpart];
|
||||
Vector *ori = new Vector[5];
|
||||
initVectors(r, numpart);
|
||||
initVectors(p, numpart);
|
||||
initVectors(x, numpart);
|
||||
initVectors(ori, 5);
|
||||
|
||||
double *dt = new double[numpart];
|
||||
initDouble(dt, numpart);
|
||||
|
||||
long *ls = new long[numpart];
|
||||
initLastSect(ls, numpart);
|
||||
|
||||
//init dks
|
||||
int ierr;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
int stream1, stream2;
|
||||
base.createStream(stream1);
|
||||
base.createStream(stream2);
|
||||
|
||||
base.registerHostMemory(r, numpart);
|
||||
base.registerHostMemory(p, numpart);
|
||||
base.registerHostMemory(x, numpart);
|
||||
base.registerHostMemory(dt, numpart);
|
||||
base.registerHostMemory(ls, numpart);
|
||||
|
||||
//***test parallelttrackerpush***//
|
||||
void *r_ptr, *p_ptr, *x_ptr, *dt_ptr, *ls_ptr, *ori_ptr;
|
||||
|
||||
//allocate memory on the device
|
||||
r_ptr = base.allocateMemory<Vector>(numpart, ierr);
|
||||
p_ptr = base.allocateMemory<Vector>(numpart, ierr);
|
||||
x_ptr = base.allocateMemory<Vector>(numpart, ierr);
|
||||
dt_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
ls_ptr = base.allocateMemory<long>(numpart, ierr);
|
||||
ori_ptr = base.allocateMemory<Vector>(5, ierr);
|
||||
|
||||
//transfer data to device
|
||||
base.writeData<Vector>(r_ptr, r, numpart);
|
||||
base.writeData<Vector>(p_ptr, p, numpart);
|
||||
base.writeData<Vector>(x_ptr, x, numpart);
|
||||
base.writeData<Vector>(ori_ptr, ori, 5);
|
||||
|
||||
|
||||
//do some couple of integration loops before the timer is started
|
||||
for (int i = 0; i < 5; i++) {
|
||||
//calc push
|
||||
base.callParallelTTrackerPush (r_ptr, p_ptr, numpart, dt_ptr,
|
||||
0.05, 1, false, stream1);
|
||||
|
||||
//read R from device
|
||||
base.readDataAsync<Vector> (r_ptr, r, numpart, stream1);
|
||||
|
||||
//write LastSection to device
|
||||
base.writeDataAsync<long> (ls_ptr, ls, numpart, stream2);
|
||||
|
||||
//calc push
|
||||
base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
|
||||
dt_ptr, 0.05, 1, false, stream2);
|
||||
//read x from device
|
||||
base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
|
||||
|
||||
//sync and wait till all tasks and reads are complete
|
||||
base.syncDevice();
|
||||
}
|
||||
|
||||
checkSum(r, numpart);
|
||||
checkSum(x, numpart);
|
||||
|
||||
|
||||
|
||||
//start the timing of integration
|
||||
struct timeval timeStart, timeEnd;
|
||||
std::cout << "start integration" << std::endl;
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i = 0; i < loop; i++) {
|
||||
|
||||
//calc push
|
||||
base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr, 0.05, 1, false, stream1);
|
||||
|
||||
//read R from device
|
||||
base.readDataAsync<Vector> (r_ptr, r, numpart, stream1);
|
||||
|
||||
//write LastSection to device
|
||||
base.writeDataAsync<long> (ls_ptr, ls, numpart, stream2);
|
||||
|
||||
//calc push transform
|
||||
base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
|
||||
dt_ptr, 0.05, 1, false, stream2);
|
||||
|
||||
//read R from device
|
||||
base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
|
||||
|
||||
//sync and wait till all tasks and reads are complete
|
||||
base.syncDevice();
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
std::cout << "end integration" << std::endl;
|
||||
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec));
|
||||
|
||||
std::cout << "Time for " << numpart << " integrations: " << t * 1e-6 << "s" << std::endl;
|
||||
std::cout << "Average time for integration: " << t * 1e-6 / loop << std::endl;
|
||||
|
||||
checkSum(r, numpart);
|
||||
checkSum(x, numpart);
|
||||
|
||||
|
||||
|
||||
//free memory
|
||||
base.freeMemory<Vector>(r_ptr, numpart);
|
||||
base.freeMemory<Vector>(p_ptr, numpart);
|
||||
base.freeMemory<Vector>(x_ptr, numpart);
|
||||
base.freeMemory<Vector>(ori_ptr, 5);
|
||||
base.freeMemory<double>(dt_ptr, numpart);
|
||||
base.freeMemory<long>(ls_ptr, numpart);
|
||||
|
||||
//unregister host memory
|
||||
base.unregisterHostMemory(r);
|
||||
base.unregisterHostMemory(p);
|
||||
base.unregisterHostMemory(x);
|
||||
base.unregisterHostMemory(dt);
|
||||
base.unregisterHostMemory(ls);
|
||||
|
||||
//free host memory
|
||||
delete[] r;
|
||||
delete[] x;
|
||||
delete[] p;
|
||||
delete[] dt;
|
||||
delete[] ls;
|
||||
delete[] ori;
|
||||
|
||||
cout << "==========================END TEST==========================" << endl;
|
||||
return 0;
|
||||
|
||||
}
|
76
test/testTranspose.cpp
Normal file
76
test/testTranspose.cpp
Normal file
@ -0,0 +1,76 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Fill a host buffer with the running sequence 0, 1, 2, ... in the real part
 * (imaginary part 0).
 *
 * @param d   destination buffer; the caller must provide room for every
 *            element written (see dim)
 * @param N   edge length of the data set
 * @param dim dimensionality selector: 2 writes N*N elements, 3 writes N*N*N
 *            elements, any other value writes N elements
 */
void initData(complex<double> *d, int N, int dim) {

    // Number of elements to write, chosen by the dimensionality.
    int count;
    switch (dim) {
    case 2:
        count = N * N;
        break;
    case 3:
        count = N * N * N;
        break;
    default:
        count = N;
        break;
    }

    int idx = 0;
    while (idx < count) {
        d[idx] = complex<double>(idx, 0);
        ++idx;
    }

}
|
||||
|
||||
/**
 * Print the real parts of a 1D, 2D or 3D data set to standard output.
 *
 * Values of one row are written on one line, each followed by a tab. Every
 * row is terminated by a newline, every plane is followed by an extra blank
 * line, and one more newline is written at the very end.
 *
 * @param d   data to print, addressed as d[plane*N*N + row*N + col]
 * @param N   edge length of the data set
 * @param dim dimensionality: dim > 1 prints N rows per plane (otherwise one),
 *            dim > 2 prints N planes (otherwise one)
 */
void printData(complex<double> *d, int N, int dim) {

    // Extent of each loop level; unused dimensions collapse to a single pass.
    const int planes = (dim > 2) ? N : 1;
    const int rows   = (dim > 1) ? N : 1;
    const int cols   = N;

    for (int plane = 0; plane < planes; ++plane) {
        const int planeOffset = plane * N * N;

        for (int row = 0; row < rows; ++row) {
            const int rowOffset = planeOffset + row * N;

            for (int col = 0; col < cols; ++col)
                std::cout << d[rowOffset + col].real() << "\t";

            std::cout << std::endl;
        }

        std::cout << std::endl;
    }

    std::cout << std::endl;

}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = (argc > 1) ? atoi(argv[1]) : 4;
|
||||
int dimN[3] = {N, N, 1};
|
||||
int dim = 2;
|
||||
int ndim = 1;
|
||||
int size = dimN[0] * dimN[1] * dimN[2];
|
||||
|
||||
std::complex<double> *hd_in = new std::complex<double>[size];
|
||||
std::complex<double> *hd_out = new std::complex<double>[size];
|
||||
initData(hd_in, N, dim);
|
||||
printData(hd_in, N, dim);
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI("OpenCL", 6);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
|
||||
int ierr;
|
||||
void *mem_ptr;
|
||||
|
||||
mem_ptr = base.allocateMemory< std::complex<double> >(size, ierr);
|
||||
base.writeData< std::complex<double> >(mem_ptr, hd_in, size);
|
||||
|
||||
base.callTranspose(mem_ptr, dimN, dim, ndim);
|
||||
|
||||
base.readData< std::complex<double> >(mem_ptr, hd_out, size);
|
||||
base.freeMemory< std::complex<double> >(mem_ptr, size);
|
||||
|
||||
printData(hd_out, N, 2);
|
||||
|
||||
delete[] hd_in;
|
||||
delete[] hd_out;
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
Reference in New Issue
Block a user