Applied a few minor optimizations to the field-distribution calculations

2011-05-28 13:07:36 +00:00
parent db4425e352
commit e22256ef8a
5 changed files with 1423 additions and 922 deletions
--- a/src/external/libFitPofB/classes/TBofZCalc.cpp
+++ b/src/external/libFitPofB/classes/TBofZCalc.cpp
@ -57,10 +57,13 @@ void TBofZCalc::Calculate()
  fZ.resize(fSteps);
  fBZ.resize(fSteps);

-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(j,ZZ) schedule(dynamic)
-#endif
-  for (j=0; j<fSteps; j++) {
+  #ifdef HAVE_GOMP
+  int chunk = fSteps/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(j,ZZ) schedule(dynamic,chunk)
+  #endif
+  for (j=0; j<fSteps; ++j) {
    ZZ = fParam[1] + static_cast<double>(j)*fDZ;
    fZ[j] = ZZ;
    fBZ[j] = GetBofZ(ZZ);
--- a/src/external/libFitPofB/classes/TBulkTriVortexFieldCalc.cpp
+++ b/src/external/libFitPofB/classes/TBulkTriVortexFieldCalc.cpp
--- a/src/external/libFitPofB/classes/TFilmTriVortexFieldCalc.cpp
+++ b/src/external/libFitPofB/classes/TFilmTriVortexFieldCalc.cpp
@ -290,7 +290,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {

  // First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  int chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBkMatrix[l][1] = fFFTin[l][0];
@ -299,11 +302,17 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
  // sum_K aK Kx^2 cos(Kx*x + Ky*y) cos(Kz*z)
  // First multiply the aK with Kx^2, then call FFTW

-  float coeffKx(4.0/3.0*pow(PI/fLatticeConstant, 2.0f));;
+  float coeffKx(4.0/3.0*pow(PI/fLatticeConstant, 2.0f));

  // k = 0
-
+  #ifdef HAVE_GOMP
+  #pragma omp parallel default(shared) private(i,j,k)
+  {
+    #pragma omp sections
+    {
      // even rows
+      #pragma omp section
+  #endif
      for (i = 0; i < NFFT; i += 2) {
        // j = 0
        fFFTin[fStepsZ*NFFT*i][0] = 0.0f;
@ -317,6 +326,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
      }

      // odd rows
+      #ifdef HAVE_GOMP
+      #pragma omp section
+      #endif
      for (i = 1; i < NFFT; i += 2) {
        for (j = 0; j < NFFT_2; j += 2) {
          fFFTin[fStepsZ*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1)*(j + 1));
@ -325,12 +337,19 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
          fFFTin[fStepsZ*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1 - NFFT)*(j + 1 - NFFT));
        }
      }
+    #ifdef HAVE_GOMP
+    } // end omp sections
+    #endif

    // k != 0
-
    if (fFind3dSolution) {
      for (k = 1; k < NFFTz; ++k) {
+        #ifdef HAVE_GOMP
+        #pragma omp sections
+        {
          // even rows
+          #pragma omp section
+        #endif
          for (i = 0; i < NFFT; i += 2) {
            // j = 0
            fFFTin[k + NFFTz*NFFT*i][0] = 0.0f;
@ -344,6 +363,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
          }

          // odd rows
+          #ifdef HAVE_GOMP
+          #pragma omp section
+          #endif
          for (i = 1; i < NFFT; i += 2) {
            for (j = 0; j < NFFT_2; j += 2) {
              fFFTin[k + NFFTz*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1)*(j + 1));
@ -352,8 +374,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
              fFFTin[k + NFFTz*(j + 1 + NFFT*i)][0] *= coeffKx*static_cast<float>((j + 1 - NFFT)*(j + 1 - NFFT));
            }
          }
+        #ifdef HAVE_GOMP
+        }
+        #endif
      }
    } // else do nothing since the other aK are already zero since the former aK manipulation
+  #ifdef HAVE_GOMP
+  } // end omp parallel
+  #endif

  fftwf_execute(fFFTplan);

@ -363,7 +391,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
    fGstorage[k] = fRealSpaceMatrix[k][0]*fRealSpaceMatrix[k][0];
  }
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fFFTin[l][0] = fBkMatrix[l][1];
@ -473,7 +501,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
    fGstorage[k] += fRealSpaceMatrix[k][0]*fRealSpaceMatrix[k][0];
  }
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fFFTin[l][0] = fBkMatrix[l][1];
@ -583,7 +611,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGatVortexCore() const {
    fGstorage[k] += fRealSpaceMatrix[k][1]*fRealSpaceMatrix[k][1];
  }
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fFFTin[l][0] = fBkMatrix[l][1];
@ -612,13 +640,18 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
  const int NFFTz_2(fStepsZ/2);

  int i, j, k, l, index;
+  #ifdef HAVE_GOMP
+  int chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif

  // Take the derivative of the Fourier sum of omega
  // This is going to be a bit lengthy...

  // First save a copy of the real aK-matrix in the imaginary part of the bK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBkMatrix[l][1] = fFFTin[l][0];
@ -687,7 +720,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {

  // Copy the results to the gradient matrix and restore the original aK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fOmegaDiffMatrix[0][l] = fRealSpaceMatrix[l][1];
@ -777,7 +810,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {

  // Copy the results to the gradient matrix and restore the original aK-matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fOmegaDiffMatrix[1][l] = fRealSpaceMatrix[l][1];
@ -838,10 +871,16 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
    // 3D transform
    fftwf_execute(fFFTplan);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    // Copy the results to the gradient matrix - with the 1D-FORWARD-transform we have to _add_ fSumAk
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fOmegaDiffMatrix[2][k + NFFTz*index] = fRealSpaceMatrix[k + NFFTz*index][1] + fSumAk[k][1];
@ -850,7 +889,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {

    // Restore the original aK-matrix
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fFFTin[l][0] = fBkMatrix[l][1];
@ -859,7 +901,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGradient() const {
  } else {
    // For the 2D solution, dw/dz = 0
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fOmegaDiffMatrix[2][l] = 0.0;
@ -1004,9 +1046,15 @@ void TFilmTriVortexNGLFieldCalc::FillAbrikosovCoefficients(const float reducedFi

  fFFTin[0][0] = 0.0;

+  #ifdef HAVE_GOMP
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif
+
  for (k = 1; k < NFFTz; ++k) {
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
    #endif
    for (index = 0; index < NFFTsq; ++index) {
      fFFTin[k + NFFTz*index][0] = 0.0;
@ -1699,9 +1747,16 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsB() const {
    }
 */
  } else { // for 2D solution only
+
+    #ifdef HAVE_GOMP
+    int chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 1; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fBkMatrix[k + NFFTz*index][0] = 0.0;
@ -1971,10 +2026,15 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpXFirst() c

  int i, j, k, kx, ky, kz, index;
  float ii;
+  #ifdef HAVE_GOMP
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@ -2116,10 +2176,15 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpXSecond()

  int i, j, k, kx, ky, kz, index;
  float ii;
+  #ifdef HAVE_GOMP
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@ -2260,7 +2325,10 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpYFirst() c

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@ -2397,7 +2465,10 @@ void TFilmTriVortexNGLFieldCalc::ManipulateFourierCoefficientsForBperpYSecond()

  // k = 0
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  int chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBkMatrix[NFFTz*index][0] = 0.0f;
@ -2575,11 +2646,18 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  const int NFFTz_2(fStepsZ/2);
  const int NFFTsqStZ_2(NFFTsq*(fStepsZ/2 + 1));

+  #ifdef HAVE_GOMP
+  int chunk;
+  #endif
+
  // first check that the field is not larger than Hc2 and that we are dealing with a type II SC ...
  if ((field >= Hc2) || (lambda < xi/sqrt(2.0))) {
    int m;
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(m) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(m) schedule(dynamic,chunk)
    #endif
    for (m = 0; m < NFFTsqStZ; ++m) {
      fBout[0][m] = 0.0f;
@ -2601,7 +2679,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  for (k = 0; k < NFFTz; ++k) {
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(j,index) schedule(dynamic)
+    chunk = NFFT/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(j,index) schedule(dynamic,chunk)
    #endif
    for (j = 0; j < NFFT; ++j) {
      index = k + NFFTz*j;
@ -2626,7 +2707,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  for (k = 0; k < NFFTz; ++k) {
    for (j = 0; j < NFFT; ++j) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(i,index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(i,index) schedule(dynamic,chunk)
      #endif
      for (i = 0; i < NFFT; ++i) {
        index = k + NFFTz*(j + NFFT*i);
@ -2645,7 +2726,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  int indexQA;

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index,denomQAInv) schedule(dynamic)
+  chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index,denomQAInv) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    if (!fOmegaMatrix[NFFTz*index] || !index || (index == (NFFT+1)*NFFT_2)) {
@ -2672,7 +2756,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  for (k = 0; k < NFFTz; ++k) {
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
    #endif
    for (index = 0; index < NFFTsq; ++index) {
      fQMatrix[k + NFFTz*index][0] = fQMatrixA[index][0];
@ -2682,7 +2766,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // initialize the bK-Matrix
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBkMatrix[l][0] = 0.0;
@ -2708,7 +2795,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
 //       }

    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; l++) {
      if (fOmegaMatrix[l]) {
@ -2726,9 +2816,6 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    // since all of this should be a smooth function anyway, I set the value of the next neighbour r
    // for the two vortex core positions in my matrix
    // If this was not enough we can get the g(0)-values by an expensive CalculateGatVortexCore()-invocation (not working at the moment)
-    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(k) schedule(dynamic)
-    #endif
    for (k = 0; k < NFFTz; ++k) {
      fRealSpaceMatrix[k][0] = fRealSpaceMatrix[k + fStepsZ*fSteps][0];//fGstorage[k];
      fRealSpaceMatrix[k + NFFTz*(NFFT+1)*NFFT_2][0] = fRealSpaceMatrix[k][0];//fGstorage[k];
@ -2747,9 +2834,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplan);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fOmegaMatrix[k + NFFTz*index] = fSumAk[k][0] - fRealSpaceMatrix[k + NFFTz*index][0];
@ -2794,7 +2887,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    // Multiply the aK with the spacial averages
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fFFTin[k + NFFTz*index][0] = fFFTin[k + NFFTz*index][0]*fSumSum;
@ -2823,9 +2916,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    }

    if (!akConverged) {
+      #ifdef HAVE_GOMP
+      chunk = NFFT/omp_get_num_procs();
+      if (chunk < 10)
+        chunk = 10;
+      #endif
      for (k = 0; k < NFFTz; ++k) {
        #ifdef HAVE_GOMP
-        #pragma omp parallel for default(shared) private(j, index) schedule(dynamic)
+        #pragma omp parallel for default(shared) private(j, index) schedule(dynamic,chunk)
        #endif
        for (j = 0; j < NFFT; ++j) {
          index = k + NFFTz*j;
@ -2856,9 +2954,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplan);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fOmegaMatrix[k + NFFTz*index] = fSumAk[k][0] - fRealSpaceMatrix[k + NFFTz*index][0];
@ -2869,6 +2973,9 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // first calculate PK (use the Q-Matrix memory for the second part)
    #ifdef HAVE_GOMP
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
@ -2905,7 +3012,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    if (firstBkCalculation) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+      chunk = NFFTStZ/omp_get_num_procs();
+      if (chunk < 10)
+        chunk = 10;
+      #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
      #endif
      for (l = 0; l < NFFTStZ; ++l) {
        fCheckBkConvergence[l] = 0.0f;
@ -2937,9 +3047,14 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
    // cout << "Bk Convergence: " << bkConverged << endl;

    if (!bkConverged) {
+      #ifdef HAVE_GOMP
+      chunk = NFFT/omp_get_num_procs();
+      if (chunk < 10)
+        chunk = 10;
+      #endif
      for (k = 0; k < NFFTz; ++k) {
        #ifdef HAVE_GOMP
-        #pragma omp parallel for default(shared) private(j) schedule(dynamic)
+        #pragma omp parallel for default(shared) private(j, index) schedule(dynamic,chunk)
        #endif
        for (j = 0; j < NFFT; ++j) {
          index = k + NFFTz*j;
@ -2950,7 +3065,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // In order to save memory I will not allocate more space for another matrix but save a copy of the bKs in the aK-Matrix
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fFFTin[l][1] = fBkMatrix[l][0];
@ -2986,9 +3104,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplanBkToBandQ);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fQMatrix[k + NFFTz*index][0] = fQMatrixA[index][0] - fBkMatrix[k + NFFTz*index][1];
@ -2997,7 +3121,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // Restore bKs for Qy calculation and Fourier transform to get Qy
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fBkMatrix[l][0] = fFFTin[l][1];
@ -3008,9 +3135,15 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    fftwf_execute(fFFTplanBkToBandQ);

+    #ifdef HAVE_GOMP
+    chunk = NFFTsq/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #endif
+
    for (k = 0; k < NFFTz; ++k) {
      #ifdef HAVE_GOMP
-      #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
      #endif
      for (index = 0; index < NFFTsq; ++index) {
        fQMatrix[k + NFFTz*index][1] = fQMatrixA[index][1] + fBkMatrix[k + NFFTz*index][1];
@ -3019,7 +3152,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

    // Restore bKs for the next iteration
    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+    chunk = NFFTsqStZ/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
    #endif
    for (l = 0; l < NFFTsqStZ; ++l) {
      fBkMatrix[l][0] = fFFTin[l][1];
@ -3035,7 +3171,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Fill in the B-Matrix and restore the bKs for the second part of the Bx-calculation
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  chunk = NFFTsqStZ/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[0][l] = fBkMatrix[l][0];
@ -3050,7 +3189,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  // Fill in the B-Matrix and restore the bKs for the By-calculation

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[0][l] = 0.5f*(fBkMatrix[l][0] - fBout[0][l])*Hc2_kappa;
@ -3064,7 +3203,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Fill in the B-Matrix and restore the bKs for the second part of the By-calculation
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[1][l] = fBkMatrix[l][0];
@ -3078,7 +3217,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Fill in the B-Matrix and restore the bKs for the second part of the By-calculation
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[1][l] = 0.5f*(fBkMatrix[l][0] - fBout[1][l])*Hc2_kappa;
@ -3089,7 +3228,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  fftwf_execute(fFFTplanBkToBandQ);

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(l) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(l) schedule(dynamic,chunk)
  #endif
  for (l = 0; l < NFFTsqStZ; ++l) {
    fBout[2][l] = (scaledB + fBkMatrix[l][0])*Hc2_kappa;
@ -3099,7 +3238,10 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {

  // Save a copy of the BkS - where does not matter since this is the very end of the calculation...
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  chunk = NFFTsq/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fFFTin[index][1] = fBkS[index][0];
@ -3112,7 +3254,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  // Write the surface fields to the field-Matrix and restore the BkS for the By-calculation

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBout[0][NFFTz/2 + NFFTz*index] = fBkS[index][1]*Hc2_kappa;
@ -3127,7 +3269,7 @@ void TFilmTriVortexNGLFieldCalc::CalculateGrid() const {
  // Write the surface fields to the field-Matrix

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(index) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(index) schedule(dynamic,chunk)
  #endif
  for (index = 0; index < NFFTsq; ++index) {
    fBout[1][NFFTz/2 + NFFTz*index] = fBkS[index][1]*Hc2_kappa;
--- a/src/external/libFitPofB/classes/TPofBCalc.cpp
+++ b/src/external/libFitPofB/classes/TPofBCalc.cpp
@ -62,7 +62,10 @@ TPofBCalc::TPofBCalc(const vector<double> &para) : fBmin(0.0), fBmax(0.0), fDT(p
  int i;

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i) {
    fB[i] = static_cast<double>(i)*fDB;
@ -83,7 +86,10 @@ TPofBCalc::TPofBCalc(const vector<double>& b, const vector<double>& pb, double d
  int i;

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i) {
    fB[i] = b[i];
@ -121,9 +127,12 @@ TPofBCalc::TPofBCalc(const vector<double>& b, const vector<double>& pb, double d

 void TPofBCalc::UnsetPBExists() {
  int i;
-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(i) schedule(dynamic)
-#endif
+  #ifdef HAVE_GOMP
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
+  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i) {
    fPB[i] = 0.0;
  }
@ -140,14 +149,17 @@ void TPofBCalc::Normalize(unsigned int minFilledIndex = 0, unsigned int maxFille
    double pBsum(0.0);

    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(i) schedule(dynamic) reduction(+:pBsum)
+    int chunk = (maxFilledIndex-minFilledIndex)/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk) reduction(+:pBsum)
    #endif
    for (i = minFilledIndex; i <= static_cast<int>(maxFilledIndex); ++i)
      pBsum += fPB[i];
    pBsum *= fDB;

    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+    #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
    #endif
    for (i = minFilledIndex; i <= static_cast<int>(maxFilledIndex); ++i)
      fPB[i] /= pBsum;
@ -160,7 +172,10 @@ void TPofBCalc::SetPB(const vector<double> &pb) const {

  int i;
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i) {
    fPB[i] = pb[i];
@ -190,14 +205,20 @@ void TPofBCalc::Calculate(const string &type, const vector<double> &para) {
    int i;

    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+    int chunk = B0Index/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
    #endif
    for (i = 0; i < B0Index; ++i) {
      fPB[i] = exp(-(fB[i]-B0)*(fB[i]-B0)/expominus);
    }

    #ifdef HAVE_GOMP
-    #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+    chunk = (BmaxIndex-B0Index)/omp_get_num_procs();
+    if (chunk < 10)
+      chunk = 10;
+    #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
    #endif
    for (i = B0Index; i <= BmaxIndex; ++i) {
      fPB[i] = exp(-(fB[i]-B0)*(fB[i]-B0)/expoplus);
@ -546,6 +567,10 @@ void TPofBCalc::Calculate(const TBulkVortexFieldCalc *vortexLattice, const vecto
    // cannot use a reduction clause here (like e.g. in Normalize()), since pBvec[] is not a scalar variable
    // therefore, we need to work on it a bit more
    int n(omp_get_num_procs()), tid, offset;
+    int chunk = fPBSize/n;
+    if (chunk < 10)
+      chunk = 10;
+
    vector< vector<unsigned int> > pBvec(n, vector<unsigned int>(fPBSize, 0));

    int indexStep(static_cast<int>(floor(static_cast<float>(numberOfSteps_2)/static_cast<float>(n))));
@ -577,7 +602,7 @@ void TPofBCalc::Calculate(const TBulkVortexFieldCalc *vortexLattice, const vecto
    }

    for (j = 0; j < n; ++j) {
-      #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+      #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
      for (i = 0; i < static_cast<int>(fPBSize); ++i) {
        fPB[i] += static_cast<double>(pBvec[j][i]);
      }
@ -624,7 +649,10 @@ void TPofBCalc::AddBackground(double B, double s, double w) {
  // calculate Gaussian background
  double bg[fPBSize];
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
  #endif
  for(i = 0; i < static_cast<int>(fPBSize); ++i) {
    bg[i] = exp(-(fB[i]-B)*(fB[i]-B)/(2.0*BsSq));
@ -634,7 +662,7 @@ void TPofBCalc::AddBackground(double B, double s, double w) {

  double bgsum(0.0);
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic) reduction(+:bgsum)
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk) reduction(+:bgsum)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i)
    bgsum += bg[i];
@ -642,14 +670,14 @@ void TPofBCalc::AddBackground(double B, double s, double w) {
  bgsum *= fDB;

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i)
    bg[i] /= bgsum;

  // add background to P(B)
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic)
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i)
    fPB[i] = (1.0 - w)*fPB[i] + w*bg[i];
@ -691,7 +719,10 @@ void TPofBCalc::ConvolveGss(double w) {

  int i;
  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i, GssInTimeDomain) schedule(dynamic)
+  int chunk = (NFFT/2+1)/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i, GssInTimeDomain) schedule(dynamic,chunk)
  #endif
  for (i = 0; i < static_cast<int>(NFFT/2+1); ++i) {
    GssInTimeDomain = exp(expo*static_cast<double>(i)*static_cast<double>(i));
@ -725,7 +756,10 @@ double TPofBCalc::GetFirstMoment() const {
  double pBsum(0.0);

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i) schedule(dynamic) reduction(+:pBsum)
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk) reduction(+:pBsum)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i)
    pBsum += fB[i]*fPB[i];
@ -746,7 +780,10 @@ double TPofBCalc::GetCentralMoment(unsigned int n) const {
  double pBsum(0.0);

  #ifdef HAVE_GOMP
-  #pragma omp parallel for default(shared) private(i, diff) schedule(dynamic) reduction(+:pBsum)
+  int chunk = fPBSize/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i, diff) schedule(dynamic,chunk) reduction(+:pBsum)
  #endif
  for (i = 0; i < static_cast<int>(fPBSize); ++i) {
    diff = fB[i]-firstMoment;
--- a/src/external/libFitPofB/classes/TPofTCalc.cpp
+++ b/src/external/libFitPofB/classes/TPofTCalc.cpp
@ -102,10 +102,13 @@ TPofTCalc::TPofTCalc (const TPofBCalc *PofB, const string &wisdom, const vector<

  int i;

-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(i) schedule(dynamic)
-#endif
-  for (i = 0; i < NFFT_2p1; i++) {
+  #ifdef HAVE_GOMP
+  int chunk = NFFT_2p1/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
+  #endif
+  for (i = 0; i < NFFT_2p1; ++i) {
    fT[i] = static_cast<double>(i)*fTBin;
  }

@ -190,10 +193,13 @@ void TPofTCalc::CalcPol(const vector<double> &par) {
  double sinph(sin(par[0]*PI/180.0)), cosph(cos(par[0]*PI/180.0));
  int i;

-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(i) schedule(dynamic)
-#endif
-  for (i=0; i<fNFFT/2+1; i++){
+  #ifdef HAVE_GOMP
+  int chunk = (fNFFT/2+1)/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #pragma omp parallel for default(shared) private(i) schedule(dynamic,chunk)
+  #endif
+  for (i=0; i<fNFFT/2+1; ++i) {
    fPT[i] = (cosph*fFFTout[i][0] + sinph*fFFTout[i][1])*par[2];
  }
 }
@ -240,7 +246,7 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
  vector<double> N0;
  vector<double> bg;

-  for(unsigned int i(0); i<numHist; i++) {
+  for(unsigned int i(0); i<numHist; ++i) {
    t0.push_back(int(par[i+4+numHist*2]));
    asy0.push_back(par[i+4]);
    phase0.push_back(par[i+4+numHist]);
@ -258,15 +264,21 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
  double ttime;
  int j,k;

-  for(unsigned int i(0); i<numHist; i++) {
+  #ifdef HAVE_GOMP
+  int chunk = nChannels/omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  #endif
+
+  for(unsigned int i(0); i<numHist; ++i) {
    param[0]=phase0[i];
    // calculate asymmetry
    CalcPol(param);

-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(j,ttime,k) schedule(dynamic)
-#endif
-    for(j=0; j<nChannels; j++) {
+    #ifdef HAVE_GOMP
+    #pragma omp parallel for default(shared) private(j,ttime,k) schedule(dynamic,chunk)
+    #endif
+    for(j=0; j<nChannels; ++j) {
      ttime=j*par[2];
      k = static_cast<int>(floor(ttime/fTBin));
      asydata[j]=asy0[i]*(fPT[k]+(fPT[k+1]-fPT[k])/fTBin*(ttime-fT[k]));
@ -289,12 +301,12 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
  vector< vector<double> > histo;
  vector<double> data(nChannels);

-  for (unsigned int i(0); i<numHist; i++) {    // loop over all histos
+  for (unsigned int i(0); i<numHist; ++i) {    // loop over all histos

-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(j) schedule(dynamic)
-#endif
-    for (j = 0; j<nChannels; j++) { // loop over time
+    #ifdef HAVE_GOMP
+    #pragma omp parallel for default(shared) private(j) schedule(dynamic,chunk)
+    #endif
+    for (j = 0; j<nChannels; ++j) { // loop over time
      if (j < t0[i]) // j<t0
        data[j] = bg[i]; // background
      else
@ -315,7 +327,7 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
  vector<TH1F*> histoData;

  TString name;
-  for (unsigned int i(0); i<numHist; i++) { // loop over all histos
+  for (unsigned int i(0); i<numHist; ++i) { // loop over all histos
    // create histos
    name   = "theoHisto";
    name  += i;
@ -327,10 +339,10 @@ void TPofTCalc::FakeData(const string &rootOutputFileName, const vector<double>
    name  += i;
    fakeHisto = new TH1F(name.Data(), name.Data(), int(par[3]), -par[2]/2.0, (par[3]+0.5)*par[2]);
    // fill theoHisto
-#ifdef HAVE_GOMP
-#pragma omp parallel for default(shared) private(j) schedule(dynamic)
-#endif
-    for (j = 0; j<nChannels; j++)
+    #ifdef HAVE_GOMP
+    #pragma omp parallel for default(shared) private(j) schedule(dynamic,chunk)
+    #endif
+    for (j = 0; j<nChannels; ++j)
      theoHisto->SetBinContent(j, histo[i][j]);
 // end omp