OpenCL FFT using clfft and tests

This commit is contained in:
Uldis Locans
2016-11-21 16:14:11 +01:00
parent 4432d32480
commit b3a12e02a8
14 changed files with 482 additions and 390 deletions

View File

@ -1,5 +1,4 @@
#include <iostream>
//#include <mpi.h>
#include <string.h>
#include "DKSBase.h"
@ -11,309 +10,265 @@ using namespace std;
void printData3D(double* data, int N, int NI, const char *message = "") {
if (strcmp(message, "") != 0)
cout << message;
if (strcmp(message, "") != 0)
cout << message;
for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t";
}
cout << endl;
}
cout << endl;
}
for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t";
}
cout << endl;
}
cout << endl;
}
}
void initData(double *data, int N) {
for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1;
}
}
}
for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1;
}
}
}
}
void initData2(double *data, int N) {
for (int i = 0; i < N; i++)
data[i] = i;
for (int i = 0; i < N; i++)
data[i] = i;
}
void initComplex( complex<double> *d, int N) {
for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0);
}
for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0);
}
}
void printComplex(complex<double> *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << "\t";
cout << endl;
for (int i = 0; i < N; i++)
cout << d[i] << "\t";
cout << endl;
}
void printDouble(double *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << ", ";
cout << endl;
}
void initMirror(double *data, int n1, int n2, int n3) {
int d = 1;
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++;
else
data[i * n2 * n1 + j * n1 + k] = 0;
}
}
}
int d = 1;
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++;
else
data[i * n2 * n1 + j * n1 + k] = 0;
}
}
}
}
void printDiv(int c) {
for (int i = 0; i < c; i++)
cout << "-";
cout << endl;
for (int i = 0; i < c; i++)
cout << "-";
cout << endl;
}
void printMirror(double *data, int n1, int n2, int n3) {
printDiv(75);
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
}
cout << endl;
}
cout << endl;
}
cout << endl;
printDiv(75);
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
}
cout << endl;
}
cout << endl;
}
cout << endl;
}
double sumData(double *data, int datasize) {
double sum = 0;
for (int i = 0; i < datasize; i++)
sum += data[i];
double sum = 0;
for (int i = 0; i < datasize; i++)
sum += data[i];
return sum;
return sum;
}
int main(int argc, char *argv[]) {
/* mpi init */
//int rank, nprocs;
//MPI_Init(&argc, &argv);
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
char *api_name = new char[10];
char *device_name = new char[10];
/*
if (nprocs != 8) {
cout << "example was set to run with 8 processes" << endl;
cout << "exit..." << endl;
return 0;
}
*/
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
/* set domain size */
int NG[3] = {64, 64, 32};
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
int id[3];
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
}
//id[0] = 0;
//id[1] = NL[1] * (rank % 4);
//id[2] = NL[2] * (rank / 4);
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
/* print some messages bout the example in the begginig */
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
int tmp[3];
/* for (int p = 1; p < nprocs; p++) {
MPI_Status mpistatus;
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
}*/
// } else {
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
// }
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
}
/* dks init and create 2 streams */
int dkserr;
//int streamGreens, streamFFT;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
#endif
cout << "Use api: " << api_name << ", " << device_name << endl;
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
#endif
/* set domain size */
int NG[3] = {64, 64, 32};
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
//base.createStream(streamFFT);
//if (rank == 0) {
// base.createStream(streamGreens);
base.setupFFT(3, NG);
//}
/* print some messages bout the example in the begginig */
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
/* allocate memory and init rho field */
double *rho = new double[sizerho];
double *rho_out = new double[sizerho];
//double *green_out = new double[sizegreen];
initMirror(rho, NL[0], NL[1], NL[2]);
/* dks init and create 2 streams */
int dkserr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(device_name));
base.initDevice();
base.setupFFT(3, NG);
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
// if (rank == 0) {
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* } else {
grntr_ptr = NULL;
rho2_ptr = NULL;
grn_ptr = NULL;
rho2tr_ptr = NULL;
tmpgreen_ptr = NULL;
}*/
/* allocate memory and init rho field */
double *rho = new double[sizerho];
double *rho_out = new double[sizerho];
//double *green_out = new double[sizegreen];
double *mirror_out = new double[sizerho];
//initMirror(rho, NL[0], NL[1], NL[2]);
initMirror(rho, NG[0], NG[1], NG[2]);
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* send and receive pointer to allocated memory on device */
/*
if (rank == 0) {
for (int p = 1; p < nprocs; p++)
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
} else {
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
}
MPI_Barrier(MPI_COMM_WORLD);
*/
/* =================================================*/
/* =================================================*/
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
double old_sum = 0;
/* =================================================*/
/* =================================================*/
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
int hr_m[3] = {1, 1, 1};
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
double old_sum = 0;
double tmp_sum = 0;
for (int l = 0; l < 100; l++) {
//MPI_Barrier(MPI_COMM_WORLD);
/* on node 0, calculate tmpgreen on gpu */
int hr_m[3] = {1, 1, 1};
//if (rank == 0)
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
hr_m[0], hr_m[1], hr_m[2]);
/* calculate greens integral on gpu */
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
/* calculate greens integral on gpu */
//if (rank == 0)
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
/* mirror the field */
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
/*
base.readData<double>(grn_ptr, mirror_out, sizerho);
for (int i = 0; i < sizerho; i++)
cout << mirror_out[i] << " ";
cout << endl << endl;
/* mirror the field */
//if (rank == 0)
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
for (int i = 0; i < sizerho; i++)
cout << rho[i] << " ";
cout << endl << endl;
*/
/* transfer rho field to device */
base.writeData<double>(rho2_ptr, rho, sizerho);
/* get FFT of rho field */
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
/* get FFT of mirrored greens integral */
//if (rank == 0)
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* get FFT of mirrored greens integral */
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* transfer rho field to device */
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
/* multiply both FFTs */
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
/* get FFT of rho field */
//if (rank == 0) {
//base.syncDevice();
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
//}
/*
complex<double> *crho = new complex<double>[sizecomp];
complex<double> *cgre = new complex<double>[sizecomp];
base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
/* multiply both FFTs */
//if (rank == 0)
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < sizecomp; i++)
cout << cgre[i].real() << " ";
cout << endl << endl;
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
//if (rank == 0) {
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//cout << "result: " << sumData(rho_out, sizerho) << endl;
if (l == 0) {
old_sum = sumData(rho_out, sizerho);
} else {
tmp_sum = sumData(rho_out, sizerho);
if (old_sum != tmp_sum) {
cout << "diff in iteration: " << l << endl;
}
}
/*} else {
MPI_Barrier(MPI_COMM_WORLD);
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
*/
for (int i = 0; i < sizecomp; i++)
cout << crho[i].real() << " ";
cout << endl << endl;
delete[] crho;
delete[] cgre;
*/
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
base.readData<double> (rho2_ptr, rho_out, sizerho);
for (int i = 0; i < 10; i++)
cout << rho_out[i] << " ";
cout << endl;
old_sum = sumData(rho_out, sizerho);
}
/* =================================================*/
/* =================================================*/
/* ==========end fftpoison solver test run==========*/
/* =================================================*/
/* =================================================*/
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
base.freeMemory<double>(rho2_ptr, sizerho);
/* free memory on device */
//if (rank == 0) {
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
base.freeMemory<double>(rho2_ptr, sizerho);
cout << "Final sum: " << old_sum << endl;
/*} else {
base.closeHandle(rho2_ptr);
MPI_Barrier(MPI_COMM_WORLD);
}*/
//MPI_Finalize();
delete[] rho_out;
delete[] rho;
delete[] mirror_out;
cout << "Final sum: " << old_sum << endl;
}