Marginal updates to the parallelized parts of libFitPofB

commit ce50cd396e
parent 6a9d0e0d69
Author: Bastian M. Wojek
Date:   2010-10-05 07:41:21 +00:00
6 changed files with 244 additions and 29 deletions
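
The recurring change in this commit is the same in all six files: the OpenMP pragmas and omp_get_num_procs() calls already present in the parallelized code are wrapped in #ifdef HAVE_GOMP, and configure now defines that macro (and keeps -fopenmp in the flags) only when libgomp is actually usable, so a build without OpenMP falls back to the serial loops. A minimal sketch of the guarded-loop pattern, using an illustrative fill function rather than code taken from the diff:

#ifdef HAVE_GOMP      // defined by configure when libgomp/-fopenmp are usable
#include <omp.h>
#endif
#include <vector>

// Illustrative helper (not from the diff): fill a z-grid, in parallel when possible.
void fillZGrid(std::vector<double>& z, double z0, double dz) {
  int j;  // declared outside the loop so it can be listed in the private() clause
  const int steps = static_cast<int>(z.size());
#ifdef HAVE_GOMP
  #pragma omp parallel for default(shared) private(j) schedule(dynamic)
#endif
  for (j = 0; j < steps; j++)
    z[j] = z0 + static_cast<double>(j)*dz;
}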

View File

@ -209,18 +209,6 @@ if test "${FFTW3_FOUND}" != "1"; then
FFTW3_LIBS="-L${FFTW3_PREFIX}/lib -lfftw3 -lm"
FFTW3_CFLAGS="-I${FFTW3_PREFIX}/include"
fi
# Check for fftw3_threads-library. If available musrfit is also linked against it (used in libTFitPofB).
SAVED_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $FFTW3_CFLAGS"
SAVED_LIBSS="$LIBS"
LIBS="$LIBS $FFTW3_LIBS"
AC_SEARCH_LIBS([fftw_init_threads], [fftw3_threads], [FFTW3_LIBS="$FFTW3_LIBS -lfftw3_threads -lpthread"
AC_DEFINE([HAVE_LIBFFTW3_THREADS], [1], [Define to 1 if fftw3_threads are available])], [], [-lpthread])
CFLAGS="$SAVED_CFLAGS"
LIBS="$SAVED_LIBS"
AC_SUBST(FFTW3_LIBS)
AC_SUBST(FFTW3_CFLAGS)
dnl -----------------------------------------------
dnl Ask user for path to gsl
@ -487,7 +475,28 @@ if test "${BUILD_BMW_LIBS}" = "1"; then
TFITPOFB_CFLAGS="-I${TFITPOFB_SRCDIR}/include"
AC_SUBST(TFITPOFB_LIBS)
AC_SUBST(TFITPOFB_CFLAGS)
# Check for fftw3_threads-library. If available musrfit is also linked against it (used in libTFitPofB).
SAVED_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $FFTW3_CFLAGS"
SAVED_LIBS="$LIBS"
LIBS="$LIBS $FFTW3_LIBS"
AC_SEARCH_LIBS([fftw_init_threads], [fftw3_threads], [FFTW3_LIBS="$FFTW3_LIBS -lfftw3_threads -lpthread"
AC_DEFINE([HAVE_LIBFFTW3_THREADS], [1], [Define to 1 if fftw3_threads are available])], [], [-lpthread])
CFLAGS="$SAVED_CFLAGS"
LIBS="$SAVED_LIBS"
# Check for gomp-library. If available musrfit is also linked against it (used in libTFitPofB).
SAVED_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS -fopenmp"
SAVED_LIBS="$LIBS"
LIBS="$LIBS -fopenmp"
AC_SEARCH_LIBS([omp_get_num_procs], [gomp], [AC_DEFINE([HAVE_GOMP], [1], [Define to 1 if gomp is available])],
[CXXFLAGS="$SAVED_CXXFLAGS" LIBS="$SAVED_LIBS"], [])
fi
AC_SUBST(FFTW3_LIBS)
AC_SUBST(FFTW3_CFLAGS)
dnl -----------------------------------------------
dnl Set host specific compiler and linker flags
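
Both configure checks follow the usual AC_SEARCH_LIBS pattern: temporarily extend the compile/link flags, try to link a tiny program that references the probed symbol (fftw_init_threads and omp_get_num_procs, respectively), and define HAVE_LIBFFTW3_THREADS or HAVE_GOMP when the link succeeds. Roughly the program those two probes stand for, as a stand-alone sketch; the compile line is an assumption and not taken from configure:

// probe.cpp -- what the two checks effectively verify at link time.
// Assumed build line: g++ -fopenmp probe.cpp -lfftw3_threads -lfftw3 -lpthread -lm
#include <cstdio>
#include <fftw3.h>   // declares fftw_init_threads(); the symbol comes from the FFTW threads library
#include <omp.h>     // declares omp_get_num_procs(); provided by the OpenMP runtime (libgomp for gcc)

int main() {
  std::printf("fftw_init_threads: %d\n", fftw_init_threads());
  std::printf("omp_get_num_procs: %d\n", omp_get_num_procs());
  return 0;
}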

View File

@ -50,7 +50,9 @@ void TBofZCalc::Calculate()
fZ.resize(fSteps);
fBZ.resize(fSteps);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j,ZZ) schedule(dynamic)
#endif
for (j=0; j<fSteps; j++) {
ZZ = fParam[1] + static_cast<double>(j)*fDZ;
fZ[j] = ZZ;

View File

@ -130,8 +130,13 @@ TBulkTriVortexLondonFieldCalc::TBulkTriVortexLondonFieldCalc(const string& wisdo
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftw_init_threads());
if (init_threads)
if (init_threads) {
#ifdef HAVE_GOMP
fftw_plan_with_nthreads(omp_get_num_procs());
#else
fftw_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
fFFTin = new fftw_complex[(fSteps/2 + 1) * fSteps];
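
The constructor change above (repeated for every field-calculator class below) picks the FFTW plan thread count from omp_get_num_procs() when OpenMP is available and falls back to two threads otherwise. fftw_plan_with_nthreads() only affects plans created after the call, so the order has to be: initialize the threads, set the thread count, then build the plans. A minimal sketch, assuming FFTW was built with thread support; the helper name and grid layout are illustrative:

#include <fftw3.h>
#ifdef HAVE_GOMP
#include <omp.h>
#endif

// Illustrative helper: create a threaded out-of-place c2r plan for an N x N grid.
fftw_plan makeThreadedPlan(int N, fftw_complex* in, double* out) {
#ifdef HAVE_LIBFFTW3_THREADS
  if (fftw_init_threads()) {            // returns non-zero on success
#ifdef HAVE_GOMP
    fftw_plan_with_nthreads(omp_get_num_procs());
#else
    fftw_plan_with_nthreads(2);
#endif
  }
#endif
  // the thread count set above applies to all plans created from here on
  return fftw_plan_dft_c2r_2d(N, N, in, out, FFTW_ESTIMATE);
}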
@ -191,7 +196,9 @@ void TBulkTriVortexLondonFieldCalc::CalculateGrid() const {
// ... but first check that the field is not larger than Hc2 and that we are dealing with a type II SC
if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
int m;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(m) schedule(dynamic)
#endif
for (m = 0; m < NFFTsq; m++) {
fFFTout[m] = field;
}
@ -274,7 +281,9 @@ void TBulkTriVortexLondonFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
// Multiply by the applied field
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] *= field;
}
@ -298,8 +307,13 @@ TBulkSqVortexLondonFieldCalc::TBulkSqVortexLondonFieldCalc(const string& wisdom,
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftw_init_threads());
if (init_threads)
if (init_threads) {
#ifdef HAVE_GOMP
fftw_plan_with_nthreads(omp_get_num_procs());
#else
fftw_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
fFFTin = new fftw_complex[(fSteps/2 + 1) * fSteps];
@ -359,7 +373,9 @@ void TBulkSqVortexLondonFieldCalc::CalculateGrid() const {
// ... but first check that the field is not larger than Hc2 and that we are dealing with a type II SC
if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
int m;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(m) schedule(dynamic)
#endif
for (m = 0; m < NFFTsq; m++) {
fFFTout[m] = field;
}
@ -397,7 +413,9 @@ void TBulkSqVortexLondonFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
// Multiply by the applied field
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] *= field;
}
@ -423,8 +441,13 @@ TBulkTriVortexMLFieldCalc::TBulkTriVortexMLFieldCalc(const string& wisdom, const
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftw_init_threads());
if (init_threads)
if (init_threads) {
#ifdef HAVE_GOMP
fftw_plan_with_nthreads(omp_get_num_procs());
#else
fftw_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
fFFTin = new fftw_complex[(fSteps/2 + 1) * fSteps];
@ -481,7 +504,9 @@ void TBulkTriVortexMLFieldCalc::CalculateGrid() const {
// ... but first check that the field is not larger than Hc2 and that we are dealing with a type II SC
if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
int m;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(m) schedule(dynamic)
#endif
for (m = 0; m < NFFTsq; m++) {
fFFTout[m] = field;
}
@ -568,7 +593,9 @@ void TBulkTriVortexMLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
// Multiply by the applied field
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] *= field;
}
@ -595,8 +622,13 @@ TBulkTriVortexAGLFieldCalc::TBulkTriVortexAGLFieldCalc(const string& wisdom, con
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftw_init_threads());
if (init_threads)
if (init_threads) {
#ifdef HAVE_GOMP
fftw_plan_with_nthreads(omp_get_num_procs());
#else
fftw_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
fFFTin = new fftw_complex[(fSteps/2 + 1) * fSteps];
@ -653,7 +685,9 @@ void TBulkTriVortexAGLFieldCalc::CalculateGrid() const {
// ... but first check that the field is not larger than Hc2 and that we are dealing with a type II SC
if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
int m;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(m) schedule(dynamic)
#endif
for (m = 0; m < NFFTsq; m++) {
fFFTout[m] = field;
}
@ -747,7 +781,9 @@ void TBulkTriVortexAGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
// Multiply by the applied field
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] *= field;
}
@ -787,8 +823,13 @@ TBulkTriVortexNGLFieldCalc::TBulkTriVortexNGLFieldCalc(const string& wisdom, con
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftw_init_threads());
if (init_threads)
if (init_threads) {
#ifdef HAVE_GOMP
fftw_plan_with_nthreads(omp_get_num_procs());
#else
fftw_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
const unsigned int stepsSq(fSteps*fSteps);
@ -876,7 +917,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGradient() const {
// Take the derivative of the Fourier sum of omega
// First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; ++l) {
fBkMatrix[l][1] = fFFTin[l][0];
}
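
The derivative is evaluated in Fourier space: with omega written as it is assembled elsewhere in this file (fSumAk minus the back transform), i.e.

\omega(\mathbf{r}) = \sum_{\mathbf{K}} a_{\mathbf{K}} \left[ 1 - \cos(\mathbf{K}\cdot\mathbf{r}) \right], \qquad \partial_x \omega(\mathbf{r}) = \sum_{\mathbf{K}} a_{\mathbf{K}} K_x \sin(\mathbf{K}\cdot\mathbf{r}),

each aK is multiplied by the corresponding wave-vector component and the sine series is then read off the imaginary part of the back transform, which is why fOmegaDiffMatrix is filled from fRealSpaceMatrix[l][1] in the following hunks (up to the sign and normalization conventions of the transforms used).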
@ -912,7 +955,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGradient() const {
fftw_execute(fFFTplan);
// Copy the results to the gradient matrix and restore the original aK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; ++l) {
fOmegaDiffMatrix[l][0] = fRealSpaceMatrix[l][1];
fFFTin[l][0] = fBkMatrix[l][1];
@ -960,7 +1005,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGradient() const {
fftw_execute(fFFTplan);
// Copy the results to the gradient matrix and restore the original aK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; ++l) {
fOmegaDiffMatrix[l][1] = fRealSpaceMatrix[l][1];
fFFTin[l][0] = fBkMatrix[l][1];
@ -1503,7 +1550,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
// first check that the field is not larger than Hc2 and that we are dealing with a type II SC ...
if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
int m;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(m) schedule(dynamic)
#endif
for (m = 0; m < NFFTsq; m++) {
fFFTout[m] = field;
}
@ -1518,7 +1567,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
FillAbrikosovCoefficients();
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFT; l++) {
fCheckAkConvergence[l] = fFFTin[l][0];
}
@ -1531,7 +1582,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fOmegaMatrix[l] = fSumAk - fRealSpaceMatrix[l][0];
}
@ -1544,7 +1597,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
double denomQA;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l,denomQA) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
if (!fOmegaMatrix[l] || !l || (l == (NFFT+1)*NFFT_2)) {
fQMatrixA[l][0] = 0.0;
@ -1569,7 +1624,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
*/
// initialize B(x,y) with the mean field
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] = scaledB;
}
@ -1581,8 +1638,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
while (!akConverged || !bkConverged) {
// First iteration step for aK
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
if (fOmegaMatrix[l]) {
fRealSpaceMatrix[l][0] = fOmegaMatrix[l]*(fOmegaMatrix[l] + fQMatrix[l][0]*fQMatrix[l][0] + fQMatrix[l][1]*fQMatrix[l][1] - 2.0) + \
@ -1610,7 +1668,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
// Need a copy of the aK-matrix since FFTW is manipulating the input in c2r and r2c transforms
// Store it in the first half of the bK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq_2; l++) {
fBkMatrix[l][0] = fFFTin[l][0];
}
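
The backup above is needed because FFTW's complex-to-real transforms are allowed to overwrite their input array during fftw_execute (and FFTW_PRESERVE_INPUT is not supported for multi-dimensional c2r plans), so the aK coefficients have to be copied out before the transform and restored afterwards. A minimal sketch of that idiom, with illustrative names:

#include <cstring>
#include <fftw3.h>

// Illustrative idiom: run a c2r plan without losing the coefficients stored in 'ak'.
void executePreservingInput(fftw_plan plan, fftw_complex* ak, fftw_complex* scratch, int n) {
  std::memcpy(scratch, ak, sizeof(fftw_complex) * n);  // save a copy
  fftw_execute(plan);                                  // the transform may clobber 'ak'
  std::memcpy(ak, scratch, sizeof(fftw_complex) * n);  // restore the original coefficients
}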
@ -1619,7 +1679,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fOmegaMatrix[l] = fSumAk - fRealSpaceMatrix[l][0];
}
@ -1645,7 +1707,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
}
// Restore the aK-matrix from the bK-space and multiply with the spatial averages
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq_2; l++) {
fFFTin[l][0] = fBkMatrix[l][0]*fSumSum/fSumOmegaSq;
fFFTin[l][1] = 0.0;
@ -1668,7 +1732,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
}
if (!akConverged) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFT; l++) {
fCheckAkConvergence[l] = fFFTin[l][0];
}
@ -1689,7 +1755,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplan);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fOmegaMatrix[l] = fSumAk - fRealSpaceMatrix[l][0];
}
@ -1698,7 +1766,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
if (akInitiallyConverged) { // if the aK iterations converged, go on with the bK calculation
//cout << "converged, count=" << count << endl;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fBkMatrix[l][0] = fOmegaMatrix[l]*fFFTout[l] + fSumAk*(scaledB - fFFTout[l]) + \
fQMatrix[l][1]*fOmegaDiffMatrix[l][0] - fQMatrix[l][0]*fOmegaDiffMatrix[l][1];
@ -1717,7 +1787,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
// Check the convergence of the bK-iterations
if (firstBkCalculation) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFT; l++) {
fCheckBkConvergence[l] = 0.0;
}
@ -1741,7 +1813,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
// cout << "Bk Convergence: " << bkConverged << endl;
if (!bkConverged) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFT; l++) {
fCheckBkConvergence[l] = fBkMatrix[l][0];
}
@ -1751,7 +1825,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
// In order to save memory I will not allocate more space for another matrix but save a copy of the bKs in the aK-Matrix
// Since aK is only half the size of bK, store every second entry in the imaginary part of aK
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l+=2) {
fFFTin[l/2][0] = fBkMatrix[l][0];
fFFTin[l/2][1] = fBkMatrix[l+1][0];
@ -1761,7 +1837,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplanBkToBandQ);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] = scaledB + fBkMatrix[l][0];
}
@ -1770,7 +1848,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
break;
// Restore bKs for Qx calculation and Fourier transform to get Qx
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l+=2) {
fBkMatrix[l][0] = fFFTin[l/2][0];
fBkMatrix[l+1][0] = fFFTin[l/2][1];
@ -1782,13 +1862,18 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplanBkToBandQ);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fQMatrix[l][0] = fQMatrixA[l][0] - fBkMatrix[l][1];
}
// Restore bKs for Qy calculation and Fourier transform to get Qy
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l+=2) {
fBkMatrix[l][0] = fFFTin[l/2][0];
fBkMatrix[l+1][0] = fFFTin[l/2][1];
@ -1800,7 +1885,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
fftw_execute(fFFTplanBkToBandQ);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fQMatrix[l][1] = fQMatrixA[l][1] + fBkMatrix[l][1];
}
@ -1809,7 +1896,9 @@ void TBulkTriVortexNGLFieldCalc::CalculateGrid() const {
// If the iterations have converged, rescale the field from Brandt's units to Gauss
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsq; l++) {
fFFTout[l] *= Hc2_kappa;
}

View File

@ -40,8 +40,8 @@ using namespace std;
#include "TMath.h"
#define PI 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067
#define TWOPI (2.0*3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067)
#define PI 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067f
#define TWOPI (2.0f*3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067f)
const float fluxQuantum(2.067833667e7f); // in this case this is Gauss times square nm
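
The f suffixes added to PI and TWOPI matter because this class does its work in single precision (fftwf_*): an unsuffixed literal is a double, so every expression mixing it with float data is promoted to double and converted back, and pow() picks its double overload. A tiny illustration of the promotion rule; the names are illustrative:

#include <cmath>

void literalPromotion(float latticeConstant) {
  // mixed float/double arguments select the double overload and return double ...
  double d = std::pow(3.141592653589793 / latticeConstant, 2.0);
  // ... while all-float arguments stay in single precision
  float  f = std::pow(3.141592653589793f / latticeConstant, 2.0f);
  (void)d; (void)f;
}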
@ -150,8 +150,13 @@ TFilmTriVortexNGLFieldCalc::TFilmTriVortexNGLFieldCalc(const string& wisdom, con
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftwf_init_threads());
if (init_threads)
if (init_threads) {
#ifdef HAVE_GOMP
fftwf_plan_with_nthreads(omp_get_num_procs());
#else
fftwf_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
const unsigned int stepsSqStZ(fSteps*fSteps*fStepsZ);
@ -277,7 +282,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
int i, j, k, l, index;
// First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBkMatrix[l][1] = fFFTin[l][0];
}
@ -285,14 +292,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
// sum_K aK Kx^2 cos(Kx*x + Ky*y) cos(Kz*z)
// First multiply the aK with Kx^2, then call FFTW
float coeffKx(4.0/3.0*pow(PI/fLatticeConstant, 2.0));;
float coeffKx(4.0f/3.0f*pow(PI/fLatticeConstant, 2.0f));
// k = 0
// even rows
for (i = 0; i < NFFT; i += 2) {
// j = 0
fFFTin[fStepsZ*NFFT*i][0] = 0.0;
fFFTin[fStepsZ*NFFT*i][0] = 0.0f;
// j != 0
for (j = 2; j < NFFT_2; j += 2) {
fFFTin[fStepsZ*(j + NFFT*i)][0] *= coeffKx*static_cast<float>(j*j);
@ -319,7 +326,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
// even rows
for (i = 0; i < NFFT; i += 2) {
// j = 0
fFFTin[k + NFFTz*NFFT*i][0] = 0.0;
fFFTin[k + NFFTz*NFFT*i][0] = 0.0f;
// j != 0
for (j = 2; j < NFFT_2; j += 2) {
fFFTin[k + NFFTz*(j + NFFT*i)][0] *= coeffKx*static_cast<float>(j*j);
@ -348,7 +355,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
denom[k] = fRealSpaceMatrix[k][0];
fGstorage[k] = fRealSpaceMatrix[k][0]*fRealSpaceMatrix[k][0];
}
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fFFTin[l][0] = fBkMatrix[l][1];
}
@ -356,14 +365,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
// sum_K aK Kx Ky cos(Kx*x + Ky*y) cos(Kz*z)
// First multiply the aK with Kx*Ky, then call FFTW
const float coeffKxKy = (4.0/sqrt3*pow(PI/fLatticeConstant, 2.0));
const float coeffKxKy = (4.0f/sqrt3*pow(PI/fLatticeConstant, 2.0f));
// k = 0
// even rows
for (i = 0; i < NFFT_2; i += 2) {
// j = 0
fFFTin[fStepsZ*NFFT*i][0] = 0.0;
fFFTin[fStepsZ*NFFT*i][0] = 0.0f;
// j != 0
for (j = 2; j < NFFT_2; j += 2) {
fFFTin[fStepsZ*(j + NFFT*i)][0] *= coeffKxKy*static_cast<float>(j*i);
@ -409,7 +418,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
// even rows
for (i = 0; i < NFFT_2; i += 2) {
// j = 0
fFFTin[k + fStepsZ*NFFT*i][0] = 0.0;
fFFTin[k + fStepsZ*NFFT*i][0] = 0.0f;
// j != 0
for (j = 2; j < NFFT_2; j += 2) {
fFFTin[k + fStepsZ*(j + NFFT*i)][0] *= coeffKxKy*static_cast<float>(j*i);
@ -420,7 +429,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
}
for (i = NFFT_2; i < NFFT; i += 2) {
// j = 0
fFFTin[k + fStepsZ*NFFT*i][0] = 0.0;
fFFTin[k + fStepsZ*NFFT*i][0] = 0.0f;
// j != 0
for (j = 2; j < NFFT_2; j += 2) {
fFFTin[k + fStepsZ*(j + NFFT*i)][0] *= coeffKxKy*static_cast<float>(j*(i - NFFT));
@ -456,7 +465,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
for (k = 0; k < NFFTz; ++k) {
fGstorage[k] += fRealSpaceMatrix[k][0]*fRealSpaceMatrix[k][0];
}
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fFFTin[l][0] = fBkMatrix[l][1];
}
@ -564,7 +575,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
for (k = 0; k < NFFTz; ++k) {
fGstorage[k] += fRealSpaceMatrix[k][1]*fRealSpaceMatrix[k][1];
}
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fFFTin[l][0] = fBkMatrix[l][1];
}
@ -597,7 +610,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
// This is going to be a bit lengthy...
// First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBkMatrix[l][1] = fFFTin[l][0];
}
@ -664,7 +679,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
fftwf_execute(fFFTplan);
// Copy the results to the gradient matrix and restore the original aK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fOmegaDiffMatrix[0][l] = fRealSpaceMatrix[l][1];
fFFTin[l][0] = fBkMatrix[l][1];
@ -752,7 +769,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
fftwf_execute(fFFTplan);
// Copy the results to the gradient matrix and restore the original aK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fOmegaDiffMatrix[1][l] = fRealSpaceMatrix[l][1];
fFFTin[l][0] = fBkMatrix[l][1];
@ -814,21 +833,27 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
// Copy the results to the gradient matrix - with the 1D-FORWARD-transform we have to _add_ fSumAk
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fOmegaDiffMatrix[2][k + NFFTz*index] = fRealSpaceMatrix[k + NFFTz*index][1] + fSumAk[k][1];
}
}
// Restore the original aK-matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fFFTin[l][0] = fBkMatrix[l][1];
fBkMatrix[l][1] = 0.0;
}
} else {
// For the 2D solution, dw/dz = 0
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fOmegaDiffMatrix[2][l] = 0.0;
fBkMatrix[l][1] = 0.0;
@ -973,7 +998,9 @@ void TFilmTriVortexNGLFieldCalc::FillAbrikosovCoefficients(const float reducedFi
fFFTin[0][0] = 0.0;
for (k = 1; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fFFTin[k + NFFTz*index][0] = 0.0;
fFFTin[k + NFFTz*index][1] = 0.0;
@ -1666,7 +1693,9 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsB() const {
*/
} else { // for 2D solution only
for (k = 1; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBkMatrix[k + NFFTz*index][0] = 0.0;
fBkMatrix[k + NFFTz*index][1] = 0.0;
@ -1937,7 +1966,9 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpXFirst() c
float ii;
// k = 0
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBkMatrix[NFFTz*index][0] = 0.0f;
}
@ -2080,7 +2111,9 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpXSecond()
float ii;
// k = 0
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBkMatrix[NFFTz*index][0] = 0.0f;
}
@ -2219,7 +2252,9 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpYFirst() c
float ii;
// k = 0
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBkMatrix[NFFTz*index][0] = 0.0f;
}
@ -2354,7 +2389,9 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpYSecond()
float ii;
// k = 0
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBkMatrix[NFFTz*index][0] = 0.0f;
}
@ -2534,7 +2571,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// first check that the field is not larger than Hc2 and that we are dealing with a type II SC ...
if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
int m;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(m) schedule(dynamic)
#endif
for (m = 0; m < NFFTsqStZ; ++m) {
fBout[0][m] = 0.0f;
fBout[1][m] = 0.0f;
@ -2554,7 +2593,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// save a few coefficients for the convergence check
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j,index) schedule(dynamic)
#endif
for (j = 0; j < NFFT; ++j) {
index = k + NFFTz*j;
fCheckAkConvergence[index] = fFFTin[index][0];
@ -2577,7 +2618,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
for (k = 0; k < NFFTz; ++k) {
for (j = 0; j < NFFT; ++j) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(i,index) schedule(dynamic)
#endif
for (i = 0; i < NFFT; ++i) {
index = k + NFFTz*(j + NFFT*i);
fOmegaMatrix[index] = fSumAk[k][0] - fRealSpaceMatrix[index][0];
@ -2594,7 +2637,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
float denomQAInv;
int indexQA;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index,denomQAInv) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
if (!fOmegaMatrix[NFFTz*index] || !index || (index == (NFFT+1)*NFFT_2)) {
fQMatrixA[index][0] = 0.0;
@ -2619,7 +2664,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// Initialize the Q-Matrix with Q-Abrikosov
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fQMatrix[k + NFFTz*index][0] = fQMatrixA[index][0];
fQMatrix[k + NFFTz*index][1] = fQMatrixA[index][1];
@ -2627,7 +2674,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
}
// initialize the bK-Matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBkMatrix[l][0] = 0.0;
// fBkMatrix[l][1] = 0.0;
@ -2651,7 +2700,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// cout << "g[" << k << "] = " << fGstorage[k] << endl;
// }
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; l++) {
if (fOmegaMatrix[l]) {
fRealSpaceMatrix[l][0] = fOmegaMatrix[l]*(fOmegaMatrix[l] + fQMatrix[l][0]*fQMatrix[l][0] + fQMatrix[l][1]*fQMatrix[l][1] - 2.0) + \
@ -2668,7 +2719,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// since all of this should be a smooth function anyway, I set the value of the next neighbour r
// for the two vortex core positions in my matrix
// If this was not enough we can get the g(0)-values by an expensive CalculateGatVortexCore()-invocation (not working at the moment)
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(k) schedule(dynamic)
#endif
for (k = 0; k < NFFTz; ++k) {
fRealSpaceMatrix[k][0] = fRealSpaceMatrix[k + fStepsZ*fSteps][0];//fGstorage[k];
fRealSpaceMatrix[k + NFFTz*(NFFT+1)*NFFT_2][0] = fRealSpaceMatrix[k][0];//fGstorage[k];
@ -2688,7 +2741,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplan);
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fOmegaMatrix[k + NFFTz*index] = fSumAk[k][0] - fRealSpaceMatrix[k + NFFTz*index][0];
}
@ -2731,7 +2786,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// Multiply the aK with the spatial averages
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fFFTin[k + NFFTz*index][0] = fFFTin[k + NFFTz*index][0]*fSumSum;
}
@ -2760,7 +2817,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
if (!akConverged) {
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j, index) schedule(dynamic)
#endif
for (j = 0; j < NFFT; ++j) {
index = k + NFFTz*j;
fCheckAkConvergence[index] = fFFTin[index][0];
@ -2791,7 +2850,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplan);
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fOmegaMatrix[k + NFFTz*index] = fSumAk[k][0] - fRealSpaceMatrix[k + NFFTz*index][0];
}
@ -2800,7 +2861,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
CalculateGradient();
// first calculate PK (use the Q-Matrix memory for the second part)
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fPkMatrix[l][0] = fOmegaMatrix[l]*fQMatrix[l][1];
fQMatrix[l][0] = fOmegaMatrix[l]*fQMatrix[l][0];
@ -2834,7 +2897,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// Check the convergence of the bK-iterations
if (firstBkCalculation) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTStZ; ++l) {
fCheckBkConvergence[l] = 0.0f;
}
@ -2866,7 +2931,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
if (!bkConverged) {
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j,index) schedule(dynamic)
#endif
for (j = 0; j < NFFT; ++j) {
index = k + NFFTz*j;
fCheckBkConvergence[index] = fBkMatrix[index][0];
@ -2875,7 +2942,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
}
// In order to save memory I will not allocate more space for another matrix but save a copy of the bKs in the aK-Matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fFFTin[l][1] = fBkMatrix[l][0];
}
@ -2911,14 +2980,18 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fQMatrix[k + NFFTz*index][0] = fQMatrixA[index][0] - fBkMatrix[k + NFFTz*index][1];
}
}
// Restore bKs for Qy calculation and Fourier transform to get Qy
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBkMatrix[l][0] = fFFTin[l][1];
fBkMatrix[l][1] = 0.0;
@ -2929,14 +3002,18 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
for (k = 0; k < NFFTz; ++k) {
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fQMatrix[k + NFFTz*index][1] = fQMatrixA[index][1] + fBkMatrix[k + NFFTz*index][1];
}
}
// Restore bKs for the next iteration
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBkMatrix[l][0] = fFFTin[l][1];
fBkMatrix[l][1] = 0.0;
@ -2950,7 +3027,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
// Fill in the B-Matrix and restore the bKs for the second part of the Bx-calculation
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBout[0][l] = fBkMatrix[l][0];
fBkMatrix[l][0] = fFFTin[l][1];
@ -2962,7 +3041,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
// Fill in the B-Matrix and restore the bKs for the By-calculation
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBout[0][l] = 0.5f*(fBkMatrix[l][0] - fBout[0][l])*Hc2_kappa;
fBkMatrix[l][0] = fFFTin[l][1];
@ -2974,7 +3056,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
// Fill in the B-Matrix and restore the bKs for the second part of the By-calculation
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBout[1][l] = fBkMatrix[l][0];
fBkMatrix[l][0] = fFFTin[l][1];
@ -2986,7 +3070,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
// Fill in the B-Matrix and restore the bKs for the second part of the By-calculation
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBout[1][l] = 0.5f*(fBkMatrix[l][0] - fBout[1][l])*Hc2_kappa;
fBkMatrix[l][0] = fFFTin[l][1];
@ -2995,7 +3081,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
fftwf_execute(fFFTplanBkToBandQ);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(l) schedule(dynamic)
#endif
for (l = 0; l < NFFTsqStZ; ++l) {
fBout[2][l] = (scaledB + fBkMatrix[l][0])*Hc2_kappa;
}
@ -3003,7 +3091,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// Since the surface is not included in the Bx and By-calculation above, we do another step
// Save a copy of the BkS - where does not matter since this is the very end of the calculation...
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fFFTin[index][1] = fBkS[index][0];
}
@ -3014,7 +3104,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// Write the surface fields to the field-Matrix and restore the BkS for the By-calculation
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBout[0][NFFTz/2 + NFFTz*index] = fBkS[index][1]*Hc2_kappa;
fBkS[index][0] = fFFTin[index][1];
@ -3027,7 +3119,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
// Write the surface fields to the field-Matrix
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(index) schedule(dynamic)
#endif
for (index = 0; index < NFFTsq; ++index) {
fBout[1][NFFTz/2 + NFFTz*index] = fBkS[index][1]*Hc2_kappa;
}

View File

@ -108,7 +108,9 @@ TPofBCalc::TPofBCalc(const vector<double>& b, const vector<double>& pb, double d
void TPofBCalc::UnsetPBExists() {
int i;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(i) schedule(dynamic)
#endif
for (i = 0; i < static_cast<int>(fPBSize); i++) {
fPB[i] = 0.0;
}
@ -478,7 +480,9 @@ void TPofBCalc::Calculate(const TBulkVortexFieldCalc *vortexLattice, const vecto
sum += fPB[i];
sum *= fDB;
int i;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(i) schedule(dynamic)
#endif
for (i = 0; i < static_cast<int>(fPBSize); ++i)
fPB[i] /= sum;
// end pragma omp parallel
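
The last hunk normalizes P(B): the integral sum*fDB is accumulated first, and every bin is then divided by it inside a guarded parallel loop. A stand-alone sketch of the same step, assuming a plain double array; the reduction clause on the accumulation is an extra shown here for completeness, the diff keeps that part serial:

// Normalize pB (bin width dB) so that the distribution integrates to one.
void normalizePB(double* pB, int size, double dB) {
  double sum(0.0);
  int i;
#ifdef HAVE_GOMP
  #pragma omp parallel for default(shared) private(i) reduction(+:sum) schedule(dynamic)
#endif
  for (i = 0; i < size; ++i)
    sum += pB[i];
  sum *= dB;
#ifdef HAVE_GOMP
  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
#endif
  for (i = 0; i < size; ++i)
    pB[i] /= sum;
}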

View File

@ -43,6 +43,7 @@
#include <cstdlib>
#include <omp.h>
#include <boost/thread.hpp>
#include <TString.h>
#include <TObjArray.h>
@ -65,12 +66,18 @@
TPofTCalc::TPofTCalc (const TPofBCalc *PofB, const string &wisdom, const vector<double> &par) : fWisdom(wisdom) {
#ifdef HAVE_LIBFFTW3_THREADS
int init_threads(fftw_init_threads());
if (!init_threads)
cout << "TPofTCalc::TPofTCalc: Couldn't initialize multiple FFTW-threads ..." << endl;
else
else {
#ifdef HAVE_GOMP
fftw_plan_with_nthreads(omp_get_num_procs());
#else
fftw_plan_with_nthreads(2);
#endif /* HAVE_GOMP */
}
#endif /* HAVE_LIBFFTW3_THREADS */
fNFFT = static_cast<int>(1.0/(gBar*par[1]*par[2]));
@ -91,7 +98,9 @@ TPofTCalc::TPofTCalc (const TPofBCalc *PofB, const string &wisdom, const vector<
int i;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(i) schedule(dynamic)
#endif
for (i = 0; i < NFFT_2p1; i++) {
fT[i] = static_cast<double>(i)*fTBin;
}
@ -177,7 +186,9 @@ void TPofTCalc::CalcPol(const vector<double> &par) {
double sinph(sin(par[0]*PI/180.0)), cosph(cos(par[0]*PI/180.0));
int i;
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(i) schedule(dynamic)
#endif
for (i=0; i<fNFFT/2+1; i++){
fPT[i] = (cosph*fFFTout[i][0] + sinph*fFFTout[i][1])*par[2];
}
@ -248,7 +259,9 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
// calculate asymmetry
CalcPol(param);
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j,ttime,k) schedule(dynamic)
#endif
for(j=0; j<nChannels; j++) {
ttime=j*par[2];
k = static_cast<int>(floor(ttime/fTBin));
@ -274,7 +287,9 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
for (unsigned int i(0); i<numHist; i++) { // loop over all histos
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j) schedule(dynamic)
#endif
for (j = 0; j<nChannels; j++) { // loop over time
if (j < t0[i]) // j<t0
data[j] = bg[i]; // background
@ -308,7 +323,9 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
name += i;
fakeHisto = new TH1F(name.Data(), name.Data(), int(par[3]), -par[2]/2.0, (par[3]+0.5)*par[2]);
// fill theoHisto
#ifdef HAVE_GOMP
#pragma omp parallel for default(shared) private(j) schedule(dynamic)
#endif
for (j = 0; j<nChannels; j++)
theoHisto->SetBinContent(j, histo[i][j]);
// end omp