Applied a few minor optimizations to the field-distribution calculations

2011-05-28 13:07:36 +00:00
parent db4425e352
commit e22256ef8a
5 changed files with 1423 additions and 922 deletions
--- a/src/external/libFitPofB/classes/TFilmTriVortexFieldCalc.cpp
+++ b/src/external/libFitPofB/classes/TFilmTriVortexFieldCalc.cpp
@@ -290,7 +290,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {

  // First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  int chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBkMatrix[l][1] = fFFTin[l][0];
@@ -299,61 +302,86 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
  // sum_K aK Kx^2 cos(Kx*x + Ky*y) cos(Kz*z)
  // First multiply the aK with Kx^2, then call FFTW

-  float coeffKx(4.0/3.0*pow(PI/fLatticeConstant, 2.0f));;
+  float coeffKx(4.0/3.0*pow(PI/fLatticeConstant, 2.0f));

  // k = 0
-
-  // even rows
-  for (i = 0; i < NFFT; i += 2) {
-    // j = 0
-    fFFTin[fStepsZ*NFFT*i][0] = 0.0f;
-    // j != 0
-    for (j = 2; j < NFFT_2; j += 2) {
-      fFFTin[fStepsZ*(j + NFFT*i)][0] *= coeffKx*static_cast<float>(j*j);
-    }
-    for (j = NFFT_2; j < NFFT; j += 2) {
-      fFFTin[fStepsZ*(j + NFFT*i)][0] *= coeffKx*static_cast<float>((j - NFFT)*(j - NFFT));
-    }
-  }
-
-  // odd rows
-  for (i = 1; i < NFFT; i += 2) {
-    for (j = 0; j < NFFT_2; j += 2) {
-      fFFTin[fStepsZ*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1)*(j + 1));
-    }
-    for (j = NFFT_2; j < NFFT; j += 2) {
-      fFFTin[fStepsZ*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1 - NFFT)*(j + 1 - NFFT));
-    }
-  }
-
-  // k != 0
-
-  if (fFind3dSolution) {
-    for (k = 1; k < NFFTz; ++k) {
+  #ifdef HAVE_GOMP
+  #pragma omp parallel default(shared) private(i,j,k)
+  {
+    #pragma omp sections
+    {
      // even rows
+      #pragma omp section
+  #endif
      for (i = 0; i < NFFT; i += 2) {
        // j = 0
-        fFFTin[k + NFFTz*NFFT*i][0] = 0.0f;
+        fFFTin[fStepsZ*NFFT*i][0] = 0.0f;
        // j != 0
        for (j = 2; j < NFFT_2; j += 2) {
-          fFFTin[k + NFFTz*(j + NFFT*i)][0] *= coeffKx*static_cast<float>(j*j);
+          fFFTin[fStepsZ*(j + NFFT*i)][0] *= coeffKx*static_cast<float>(j*j);
        }
        for (j = NFFT_2; j < NFFT; j += 2) {
-          fFFTin[k + NFFTz*(j + NFFT*i)][0] *= coeffKx*static_cast<float>((j - NFFT)*(j - NFFT));
+          fFFTin[fStepsZ*(j + NFFT*i)][0] *= coeffKx*static_cast<float>((j - NFFT)*(j - NFFT));
        }
      }

      // odd rows
+      #ifdef HAVE_GOMP
+      #pragma omp section
+      #endif
      for (i = 1; i < NFFT; i += 2) {
        for (j = 0; j < NFFT_2; j += 2) {
-          fFFTin[k + NFFTz*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1)*(j + 1));
+          fFFTin[fStepsZ*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1)*(j + 1));
        }
        for (j = NFFT_2; j < NFFT; j += 2) {
-          fFFTin[k + NFFTz*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1 - NFFT)*(j + 1 - NFFT));
+          fFFTin[fStepsZ*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1 - NFFT)*(j + 1 - NFFT));
        }
      }
-    }
-  } // else do nothing since the other aK are already zero since the former aK manipulation
+    #ifdef HAVE_GOMP
+    } // end omp sections
+    #endif
+
+    // k != 0
+    if (fFind3dSolution) {
+      for (k = 1; k < NFFTz; ++k) {
+        #ifdef HAVE_GOMP
+        #pragma omp sections
+        {
+          // even rows
+          #pragma omp section
+        #endif
+          for (i = 0; i < NFFT; i += 2) {
+            // j = 0
+            fFFTin[k + NFFTz*NFFT*i][0] = 0.0f;
+            // j != 0
+            for (j = 2; j < NFFT_2; j += 2) {
+              fFFTin[k + NFFTz*(j + NFFT*i)][0] *= coeffKx*static_cast<float>(j*j);
+            }
+            for (j = NFFT_2; j < NFFT; j += 2) {
+              fFFTin[k + NFFTz*(j + NFFT*i)][0] *= coeffKx*static_cast<float>((j - NFFT)*(j - NFFT));
+            }
+          }
+
+          // odd rows
+          #ifdef HAVE_GOMP
+          #pragma omp section
+          #endif
+          for (i = 1; i < NFFT; i += 2) {
+            for (j = 0; j < NFFT_2; j += 2) {
+              fFFTin[k + NFFTz*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1)*(j + 1));
+            }
+            for (j = NFFT_2; j < NFFT; j += 2) {
+              fFFTin[k + NFFTz*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1 - NFFT)*(j + 1 - NFFT));
+            }
+          }
+        #ifdef HAVE_GOMP
+        }
+        #endif
+      }
+    } // else do nothing since the other aK are already zero since the former aK manipulation
+  #ifdef HAVE_GOMP
+  } // end omp parallel
+  #endif

  fftwf_execute(fFFTplan);

@@ -363,7 +391,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
    fGstorage[k] = fRealSpaceMatrix[k][0]*fRealSpaceMatrix[k][0];
  }
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fFFTin[l][0] = fBkMatrix[l][1];
@@ -473,7 +501,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
    fGstorage[k] += fRealSpaceMatrix[k][0]*fRealSpaceMatrix[k][0];
  }
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fFFTin[l][0] = fBkMatrix[l][1];
@@ -583,7 +611,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
    fGstorage[k] += fRealSpaceMatrix[k][1]*fRealSpaceMatrix[k][1];
  }
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fFFTin[l][0] = fBkMatrix[l][1];
@@ -612,13 +640,18 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
  const int NFFTz_2(fStepsZ/2);

  int i, j, k, l, index;
+  #ifdef HAVE_GOMP
+  int chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif

  // Take the derivative of the Fourier sum of omega
  // This is going to be a bit lengthy...

  // First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBkMatrix[l][1] = fFFTin[l][0];
@@ -687,7 +720,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {

  // Copy the results to the gradient matrix and restore the original aK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fOmegaDiffMatrix[0][l] = fRealSpaceMatrix[l][1];
@@ -777,7 +810,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {

  // Copy the results to the gradient matrix and restore the original aK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fOmegaDiffMatrix[1][l] = fRealSpaceMatrix[l][1];
@@ -838,10 +871,16 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
    // 3D transform
    fftwf_execute(fFFTplan);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    // Copy the results to the gradient matrix - with the 1D-FORWARD-transform we have to _add_ fSumAk
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fOmegaDiffMatrix[2][k + NFFTz*index] = fRealSpaceMatrix[k + NFFTz*index][1] + fSumAk[k][1];
@@ -850,7 +889,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {

    // Restore the original aK-matrix
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fFFTin[l][0] = fBkMatrix[l][1];
@@ -859,7 +901,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
  } else {
    // For the 2D solution, dw/dz = 0
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fOmegaDiffMatrix[2][l] = 0.0;
@@ -1004,9 +1046,15 @@ void TFilmTriVortexNGLFieldCalc::FillAbrikosovCoefficients(const float reducedFi

  fFFTin[0][0] = 0.0;

+  #ifdef HAVE_GOMP
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif
+
  for (k = 1; k < NFFTz; ++k) {
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
    #endif
    for (index = 0; index < NFFTsq; ++index) {
      fFFTin[k + NFFTz*index][0] = 0.0;
@@ -1699,9 +1747,16 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsB() const {
    }
 */
  } else { // for 2D solution only
+
+    #ifdef HAVE_GOMP
+    int chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 1; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fBkMatrix[k + NFFTz*index][0] = 0.0;
@@ -1971,10 +2026,15 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpXFirst() c

  int i, j, k, kx, ky, kz, index;
  float ii;
+  #ifdef HAVE_GOMP
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@@ -2116,10 +2176,15 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpXSecond()

  int i, j, k, kx, ky, kz, index;
  float ii;
+  #ifdef HAVE_GOMP
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@@ -2260,7 +2325,10 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpYFirst() c

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@@ -2397,7 +2465,10 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpYSecond()

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@@ -2575,11 +2646,18 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  const int NFFTz_2(fStepsZ/2);
  const int NFFTsqStZ_2(NFFTsq*(fStepsZ/2 + 1));

+  #ifdef HAVE_GOMP
+  int chunk;
+  #endif
+
  // first check that the field is not larger than Hc2 and that we are dealing with a type II SC ...
  if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
    int m;
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(m) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(m) schedule(dynamic,chunk)
    #endif
    for (m = 0; m < NFFTsqStZ; ++m) {
      fBout[0][m] = 0.0f;
@@ -2601,7 +2679,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  for (k = 0; k < NFFTz; ++k) {
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(j,index) schedule(dynamic)
+    chunk = NFFT/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(j,index) schedule(dynamic,chunk)
    #endif
    for (j = 0; j < NFFT; ++j) {
      index = k + NFFTz*j;
@@ -2626,7 +2707,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  for (k = 0; k < NFFTz; ++k) {
    for (j = 0; j < NFFT; ++j) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(i,index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(i,index) schedule(dynamic,chunk)
      #endif
      for (i = 0; i < NFFT; ++i) {
        index = k + NFFTz*(j + NFFT*i);
@@ -2645,7 +2726,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  int indexQA;

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index,denomQAInv) schedule(dynamic)
+  chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index,denomQAInv) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    if (!fOmegaMatrix[NFFTz*index] || !index || (index == (NFFT+1)*NFFT_2)) {
@@ -2672,7 +2756,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  for (k = 0; k < NFFTz; ++k) {
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
    #endif
    for (index = 0; index < NFFTsq; ++index) {
      fQMatrix[k + NFFTz*index][0] = fQMatrixA[index][0];
@@ -2682,7 +2766,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // initialize the bK-Matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBkMatrix[l][0] = 0.0;
@@ -2708,7 +2795,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
 //       }

    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; l++) {
      if (fOmegaMatrix[l]) {
@@ -2726,9 +2816,6 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    // since all of this should be a smooth function anyway, I set the value of the next neighbour r
    // for the two vortex core positions in my matrix
    // If this was not enough we can get the g(0)-values by an expensive CalculateGatVortexCore()-invocation (not working at the moment)
-    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(k) schedule(dynamic)
-    #endif
    for (k = 0; k < NFFTz; ++k) {
      fRealSpaceMatrix[k][0] = fRealSpaceMatrix[k + fStepsZ*fSteps][0];//fGstorage[k];
      fRealSpaceMatrix[k + NFFTz*(NFFT+1)*NFFT_2][0] = fRealSpaceMatrix[k][0];//fGstorage[k];
@@ -2747,9 +2834,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplan);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fOmegaMatrix[k + NFFTz*index] = fSumAk[k][0] - fRealSpaceMatrix[k + NFFTz*index][0];
@@ -2794,7 +2887,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    // Multiply the aK with the spacial averages
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fFFTin[k + NFFTz*index][0] = fFFTin[k + NFFTz*index][0]*fSumSum;
@@ -2823,9 +2916,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    }

    if (!akConverged) {
+      #ifdef HAVE_GOMP
+      chunk = NFFT/omp_get_num_procs();
+      if (chunk < 10)
+        chunk = 10;
+      #endif
      for (k = 0; k < NFFTz; ++k) {
        #ifdef HAVE_GOMP
-        #pragma omp parallel for default(shared) private(j, index) schedule(dynamic)
+        #pragma omp parallel for default(shared) private(j, index) schedule(dynamic,chunk)
        #endif
        for (j = 0; j < NFFT; ++j) {
          index = k + NFFTz*j;
@@ -2856,9 +2954,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplan);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fOmegaMatrix[k + NFFTz*index] = fSumAk[k][0] - fRealSpaceMatrix[k + NFFTz*index][0];
@@ -2869,6 +2973,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // first calculate PK (use the Q-Matrix memory for the second part)
    #ifdef HAVE_GOMP
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
@@ -2905,7 +3012,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    if (firstBkCalculation) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+      chunk = NFFTStZ/omp_get_num_procs();
+      if (chunk < 10)
+        chunk = 10;
+      #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
      #endif
      for (l = 0; l < NFFTStZ; ++l) {
        fCheckBkConvergence[l] = 0.0f;
@@ -2937,9 +3047,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    // cout << "Bk Convergence: " << bkConverged << endl;

    if (!bkConverged) {
+      #ifdef HAVE_GOMP
+      chunk = NFFT/omp_get_num_procs();
+      if (chunk < 10)
+        chunk = 10;
+      #endif
      for (k = 0; k < NFFTz; ++k) {
        #ifdef HAVE_GOMP
-        #pragma omp parallel for default(shared) private(j) schedule(dynamic)
+        #pragma omp parallel for default(shared) private(j, index) schedule(dynamic,chunk)
        #endif
        for (j = 0; j < NFFT; ++j) {
          index = k + NFFTz*j;
@@ -2950,7 +3065,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // In order to save memory I will not allocate more space for another matrix but save a copy of the bKs in the aK-Matrix
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fFFTin[l][1] = fBkMatrix[l][0];
@@ -2986,9 +3104,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplanBkToBandQ);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fQMatrix[k + NFFTz*index][0] = fQMatrixA[index][0] - fBkMatrix[k + NFFTz*index][1];
@@ -2997,7 +3121,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // Restore bKs for Qy calculation and Fourier transform to get Qy
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fBkMatrix[l][0] = fFFTin[l][1];
@@ -3008,9 +3135,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplanBkToBandQ);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fQMatrix[k + NFFTz*index][1] = fQMatrixA[index][1] + fBkMatrix[k + NFFTz*index][1];
@@ -3019,7 +3152,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // Restore bKs for the next iteration
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fBkMatrix[l][0] = fFFTin[l][1];
@@ -3035,7 +3171,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Fill in the B-Matrix and restore the bKs for the second part of the Bx-calculation
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[0][l] = fBkMatrix[l][0];
@@ -3050,7 +3189,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  // Fill in the B-Matrix and restore the bKs for the By-calculation

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[0][l] = 0.5f*(fBkMatrix[l][0] - fBout[0][l])*Hc2_kappa;
@@ -3064,7 +3203,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Fill in the B-Matrix and restore the bKs for the second part of the By-calculation
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[1][l] = fBkMatrix[l][0];
@@ -3078,7 +3217,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Fill in the B-Matrix and restore the bKs for the second part of the By-calculation
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[1][l] = 0.5f*(fBkMatrix[l][0] - fBout[1][l])*Hc2_kappa;
@@ -3089,7 +3228,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  fftwf_execute(fFFTplanBkToBandQ);

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[2][l] = (scaledB + fBkMatrix[l][0])*Hc2_kappa;
@@ -3099,7 +3238,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Save a copy of the BkS - where does not matter since this is the very end of the calculation...
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fFFTin[index][1] = fBkS[index][0];
@@ -3112,7 +3254,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  // Write the surface fields to the field-Matrix and restore the BkS for the By-calculation

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBout[0][NFFTz/2 + NFFTz*index] = fBkS[index][1]*Hc2_kappa;
@@ -3127,7 +3269,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  // Write the surface fields to the field-Matrix

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBout[1][NFFTz/2 + NFFTz*index] = fBkS[index][1]*Hc2_kappa;