"List of configuration types"
FORCE)
endif()
-set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
+set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
# all tokens are separated by any mix of ',' commas, '=' equal signs
# and whitespace (space, tab)
#
+
+# Teach uncrustify about the GROMACS attribute aliases that we use
+# to hide compiler differences. This means that declarations like
+#
+# int i, j;
+# int nthreads gmx_unused;
+#
+# do not get i aligned with gmx_unused.
+set ATTRIBUTE gmx_unused gmx_inline gmx_restrict
# be set up elsewhere and passed to this function, but it is
# inconvenient in CMake to pass more than one list, and such a
# list is only used here.
- foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
+ foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}")
endforeach()
# Copy the flags that are only used by the real Release build
endif()
# Append to the variables for the given build type for
- # each language, in the parent scope.
+ # each language, in the parent scope. We add our new variables at the end, so
+ # compiler-specific choices are more likely to override default CMake choices.
+ # This is for instance useful for RelWithDebInfo builds, where we want to use the full
+ # set of our optimization flags detected in this file, rather than having -O2 override them.
set(CMAKE_${language}_FLAGS${punctuation}${build_type}
- "${GMXC_${language}FLAGS${punctuation}${build_type}} ${CMAKE_${language}_FLAGS${punctuation}${build_type}}"
+ "${CMAKE_${language}_FLAGS${punctuation}${build_type}} ${GMXC_${language}FLAGS${punctuation}${build_type}}"
PARENT_SCOPE)
endforeach()
endforeach()
message(WARNING "All tested PGI compiler versions (up to 12.9.0) generate binaries which produce incorrect results, or even fail to compile Gromacs. Highly recommended to use a different compiler. If you choose to use PGI, make sure to run the regressiontests.")
endif()
- if(CMAKE_COMPILER_IS_GNUCC AND WIN32 AND (GMX_SIMD STREQUAL "AVX_256" OR GMX_SIMD STREQUAL "AVX2_256"))
- message(WARNING "GCC on Windows with AVX crashes. Choose SSE4_1 or a different compiler.") # GCC bug 49001.
+ if(CMAKE_COMPILER_IS_GNUCC AND
+ (CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9.0" OR CMAKE_SIZEOF_VOID_P EQUAL 8)
+ AND (WIN32 OR CYGWIN)
+ AND GMX_SIMD MATCHES "AVX" AND NOT GMX_SIMD STREQUAL AVX_128_FMA)
+ message(WARNING "GCC on Windows (GCC older than 4.9 or any version when compiling for 64bit) with AVX (other than AVX_128_FMA) crashes. Choose a different GMX_SIMD or a different compiler.") # GCC bug 49001, 54412.
endif()
if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND WIN32 AND NOT CYGWIN)
{
/* To prevent confusion, do not again issue a gmx_fatal here since we already
* get the error message from mdrun itself */
- sprintf(msg, "Cannot run the benchmark simulations! Please check the error message of\n"
+ sprintf(msg,
+ "Cannot run the first benchmark simulation! Please check the error message of\n"
"mdrun for the source of the problem. Did you provide a command line\n"
- "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
+ "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
+ "sure your command line should work, you can bypass this check with \n"
+ "gmx tune_pme -nocheck. The failing command was:\n"
"\n%s\n\n", command);
fprintf(stderr, "%s", msg);
int npme_fixed, /* If >= -1, test fixed number of PME
* nodes only */
const char *npmevalues_opt, /* Which -npme values should be tested */
- t_perf **perfdata, /* Here the performace data is stored */
+ t_perf **perfdata, /* Here the performance data is stored */
int *pmeentries, /* Entries in the nPMEnodes list */
int repeats, /* Repeat each test this often */
int nnodes, /* Total number of nodes = nPP + nPME */
const t_filenm *fnm, /* List of filenames from command line */
int nfile, /* Number of files specified on the cmdl. */
int presteps, /* DLB equilibration steps, is checked */
- gmx_int64_t cpt_steps) /* Time step counter in the checkpoint */
+ gmx_int64_t cpt_steps, /* Time step counter in the checkpoint */
+ gmx_bool bCheck) /* Check whether benchmark mdrun works */
{
int i, nr, k, ret, count = 0, totaltests;
int *nPMEnodes = NULL;
cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
/* To prevent that all benchmarks fail due to a show-stopper argument
- * on the mdrun command line, we make a quick check first */
- if (bFirst)
+ * on the mdrun command line, we make a quick check first.
+ * This check can be turned off in cases where the automatically chosen
+ * number of PME-only ranks leads to a number of PP ranks for which no
+ * decomposition can be found (e.g. for large prime numbers) */
+ if (bFirst && bCheck)
{
make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
}
"need to provide a machine- or hostfile. This can also be passed",
"via the MPIRUN variable, e.g.[PAR]",
"[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
+ "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
+ "check whether mdrun works as expected with the provided parallel settings",
+ "if the [TT]-check[tt] option is activated (the default).",
"Please call [THISMODULE] with the normal options you would pass to",
"[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
"tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
"written with enlarged cutoffs and smaller Fourier grids respectively.",
"Typically, the first test (number 0) will be with the settings from the input",
"[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
- "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
+ "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
"In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
"The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
"spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
"MD systems. The dynamic load balancing needs about 100 time steps",
"to adapt to local load imbalances, therefore the time step counters",
"are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
- "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
+ "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
"From the 'DD' load imbalance entries in the md.log output file you",
"can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
"[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
gmx_bool bKeepAndNumCPT = FALSE;
gmx_bool bResetCountersHalfWay = FALSE;
gmx_bool bBenchmark = TRUE;
+ gmx_bool bCheck = TRUE;
output_env_t oenv = NULL;
"Launch the real simulation after optimization" },
{ "-bench", FALSE, etBOOL, {&bBenchmark},
"Run the benchmarks or just create the input [TT].tpr[tt] files?" },
+ { "-check", FALSE, etBOOL, {&bCheck},
+ "Before the benchmark runs, check whether mdrun works in parallel" },
/******************/
/* mdrun options: */
/******************/
{
do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
- cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
+ cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck);
fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
/* Determine how many pre-factors of 2 we need */
fac2 = 1;
i = g_baseNR - 1;
- while (fac2*grid_base[i-1] < nmin)
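+ /* Double fac2 until fac2 times the last grid_base entry is at least nmin */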
+ while (fac2*grid_base[i] < nmin)
{
fac2 *= 2;
}
{
sprintf(errbuf, "Overriding %s parameters.%s",
interaction_function[ftype].longname,
- (ftype == F_PDIHS) ? "\nUse dihedraltype 4 to allow several multiplicity terms." : "");
+ (ftype == F_PDIHS) ?
+ "\nUse dihedraltype 9 to allow several multiplicity terms. Only consecutive lines are combined. Non-consective lines overwrite each other."
+ : "");
warning(wi, errbuf);
- fprintf(stderr, " old:");
+ fprintf(stderr, " old: ");
for (j = 0; (j < nrfp); j++)
{
fprintf(stderr, " %g", bt->param[i].c[j]);
* possible after subsequently setting a shorter cut-off with change_dd_cutoff.
*/
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd);
+/* Returns whether the DLB lock is set */
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue);
+/* Set or clear a lock such that with DLB=auto DLB cannot get turned on while the lock is set */
+
void dd_setup_dlb_resource_sharing(t_commrec *cr,
const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt);
}
}
+/* This is mostly a copy of pdihs_noener_simd above, but using
+ * the RB potential instead of a harmonic potential.
+ * This function can replace rbdihs() when no energy and virial are needed.
+ */
+static void
+rbdihs_noener_simd(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
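+ /* Each entry in forceatoms is 1 type index followed by 4 atom indices */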
+ const int nfa1 = 5;
+ int i, iu, s, j;
+ int type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+ real dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+ real buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+ real *parm, *p, *q;
+
+ gmx_simd_real_t phi_S;
+ gmx_simd_real_t ddphi_S, cosfac_S;
+ gmx_simd_real_t mx_S, my_S, mz_S;
+ gmx_simd_real_t nx_S, ny_S, nz_S;
+ gmx_simd_real_t nrkj_m2_S, nrkj_n2_S;
+ gmx_simd_real_t parm_S, c_S;
+ gmx_simd_real_t sin_S, cos_S;
+ gmx_simd_real_t sf_i_S, msf_l_S;
+ pbc_simd_t pbc_simd;
+
+ gmx_simd_real_t pi_S = gmx_simd_set1_r(M_PI);
+ gmx_simd_real_t one_S = gmx_simd_set1_r(1.0);
+
+ /* Ensure SIMD register alignment */
+ dr = gmx_simd_align_r(dr_array);
+ buf = gmx_simd_align_r(buf_array);
+
+ /* Extract aligned pointer for parameters and variables */
+ parm = buf;
+ p = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+ q = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
+
+ set_pbc_simd(pbc, &pbc_simd);
+
+ /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+ for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ {
+ /* Collect atom quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+ * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+ al[s] = forceatoms[iu+4];
+
+ /* We don't need the first parameter, since that's a constant
+ * which only affects the energies, not the forces.
+ */
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm[j*GMX_SIMD_REAL_WIDTH + s] =
+ forceparams[type].rbdihs.rbcA[j];
+ }
+
+ /* At the end fill the arrays with identical entries */
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+
+ /* Calculate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+ dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+ dr,
+ &phi_S,
+ &mx_S, &my_S, &mz_S,
+ &nx_S, &ny_S, &nz_S,
+ &nrkj_m2_S,
+ &nrkj_n2_S,
+ p, q);
+
+ /* Change to polymer convention */
+ phi_S = gmx_simd_sub_r(phi_S, pi_S);
+
+ gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+
+ ddphi_S = gmx_simd_setzero_r();
+ c_S = one_S;
+ cosfac_S = one_S;
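+ /* Accumulate dV/dcos = sum_j j*C_j*cos^(j-1) of the RB potential for j = 1..NR_RBDIHS-1 */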
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm_S = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+ ddphi_S = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+ cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+ c_S = gmx_simd_add_r(c_S, one_S);
+ }
+
+ /* Note that here we do not use the minus sign which is present
+ * in the normal RB code. This is corrected for through (m)sf below.
+ */
+ ddphi_S = gmx_simd_mul_r(ddphi_S, sin_S);
+
+ sf_i_S = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+ msf_l_S = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+
+ /* After this m?_S will contain f[i] */
+ mx_S = gmx_simd_mul_r(sf_i_S, mx_S);
+ my_S = gmx_simd_mul_r(sf_i_S, my_S);
+ mz_S = gmx_simd_mul_r(sf_i_S, mz_S);
+
+ /* After this n?_S will contain -f[l] */
+ nx_S = gmx_simd_mul_r(msf_l_S, nx_S);
+ ny_S = gmx_simd_mul_r(msf_l_S, ny_S);
+ nz_S = gmx_simd_mul_r(msf_l_S, nz_S);
+
+ gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+ gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+ gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+ gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+ gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+ gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+
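+ /* Distribute the forces of the GMX_SIMD_REAL_WIDTH dihedrals over their four atoms */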
+ iu = i;
+ s = 0;
+ do
+ {
+ do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+ p[s], q[s],
+ dr[ XX *GMX_SIMD_REAL_WIDTH+s],
+ dr[ YY *GMX_SIMD_REAL_WIDTH+s],
+ dr[ ZZ *GMX_SIMD_REAL_WIDTH+s],
+ dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+ dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+ dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+ f);
+ s++;
+ iu += nfa1;
+ }
+ while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+ }
+}
+
#endif /* GMX_SIMD_HAVE_REAL */
global_atom_index);
v = 0;
}
+#ifdef GMX_SIMD_HAVE_REAL
+ else if (ftype == F_RBDIHS &&
+ !bCalcEnerVir && fr->efep == efepNO)
+ {
+ /* No energies, shift forces, dvdl */
+ rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+ idef->iparams,
+ (const rvec*)x, f,
+ pbc, g, lambda[efptFTYPE], md, fcd,
+ global_atom_index);
+ v = 0;
+ }
+#endif
else
{
v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
/* The DLB option */
int eDLB;
+ /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+ gmx_bool bDLB_locked;
/* Are we actually using DLB? */
gmx_bool bDynLoadBal;
int eFlop;
double flop;
int flop_n;
- /* Have often have did we have load measurements */
+ /* How many times have we had load measurements */
int n_load_have;
- /* Have often have we collected the load measurements */
+ /* How many times have we collected the load measurements */
int n_load_collect;
/* Statistics */
cell_size[i] = 1.0/ncd;
}
}
- else if (dd_load_count(comm))
+ else if (dd_load_count(comm) > 0)
{
load_aver = comm->load[d].sum_m/ncd;
change_max = 0;
static void print_cg_move(FILE *fplog,
gmx_domdec_t *dd,
gmx_int64_t step, int cg, int dim, int dir,
- gmx_bool bHaveLimitdAndCMOld, real limitd,
+ gmx_bool bHaveCgcmOld, real limitd,
rvec cm_old, rvec cm_new, real pos_d)
{
gmx_domdec_comm_t *comm;
comm = dd->comm;
fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
- if (bHaveLimitdAndCMOld)
+ if (limitd > 0)
{
- fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+ fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+ dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
}
else
{
- fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
+ /* We don't have a limiting distance available: don't print it */
+ fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+ dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
}
fprintf(fplog, "distance out of cell %f\n",
dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
- if (bHaveLimitdAndCMOld)
+ if (bHaveCgcmOld)
{
fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
cm_old[XX], cm_old[YY], cm_old[ZZ]);
static void cg_move_error(FILE *fplog,
gmx_domdec_t *dd,
gmx_int64_t step, int cg, int dim, int dir,
- gmx_bool bHaveLimitdAndCMOld, real limitd,
+ gmx_bool bHaveCgcmOld, real limitd,
rvec cm_old, rvec cm_new, real pos_d)
{
if (fplog)
{
print_cg_move(fplog, dd, step, cg, dim, dir,
- bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+ bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
}
print_cg_move(stderr, dd, step, cg, dim, dir,
- bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+ bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
gmx_fatal(FARGS,
- "A charge group moved too far between two domain decomposition steps\n"
- "This usually means that your system is not well equilibrated");
+ "%s moved too far between two domain decomposition steps\n"
+ "This usually means that your system is not well equilibrated",
+ dd->comm->bCGs ? "A charge group" : "An atom");
}
static void rotate_state_atom(t_state *state, int a)
{
if (pos_d >= limit1[d])
{
- cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+ cg_move_error(fplog, dd, step, cg, d, 1,
+ cg_cm != state->x, limitd[d],
cg_cm[cg], cm_new, pos_d);
}
dev[d] = 1;
{
if (pos_d < limit0[d])
{
- cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+ cg_move_error(fplog, dd, step, cg, d, -1,
+ cg_cm != state->x, limitd[d],
cg_cm[cg], cm_new, pos_d);
}
dev[d] = -1;
{
cg_move_error(fplog, dd, step, cg, dim,
(flag & DD_FLAG_FW(d)) ? 1 : 0,
- FALSE, 0,
+ fr->cutoff_scheme == ecutsGROUP, 0,
comm->vbuf.v[buf_pos],
comm->vbuf.v[buf_pos],
comm->vbuf.v[buf_pos][dim]);
/* Initialize the GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
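+ /* The DLB lock is only used with DLB=auto and can be set during PME load balancing */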
+ comm->bDLB_locked = FALSE;
comm->bDynLoadBal = (comm->eDLB == edlbYES);
if (fplog)
comm->cellsize_limit = 0;
comm->bBondComm = FALSE;
+ /* Atoms should be able to move by up to half the list buffer size (if > 0)
+ * within nstlist steps. Since boundaries are allowed to displace by half
+ * a cell size, DD cells should be at least the size of the list buffer.
+ */
+ comm->cellsize_limit = std::max(comm->cellsize_limit,
+ ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
+
if (comm->bInterCGBondeds)
{
if (comm_distance_min > 0)
comm->PMELoadBal_max_cutoff = comm->cutoff;
}
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+{
+ return dd->comm->bDLB_locked;
+}
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+{
+ /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+ if (dd->comm->eDLB == edlbAUTO)
+ {
+ dd->comm->bDLB_locked = bValue;
+ }
+}
+
static void merge_cg_buffers(int ncell,
gmx_domdec_comm_dim_t *cd, int pulse,
int *ncg_cell,
}
/* Check if we have recorded loads on the nodes */
- if (comm->bRecordLoad && dd_load_count(comm))
+ if (comm->bRecordLoad && dd_load_count(comm) > 0)
{
- if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+ if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
{
/* Check if we should use DLB at the second partitioning
* and every 100 partitionings,
* so the extra communication cost is negligible.
*/
- n = std::max(100, nstglobalcomm);
+ const int nddp_chk_dlb = 100;
bCheckDLB = (comm->n_load_collect == 0 ||
- comm->n_load_have % n == n-1);
+ comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
}
else
{
/* Since the timings are node dependent, the master decides */
if (DDMASTER(dd))
{
- bTurnOnDLB =
- (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ /* Here we check if the max PME rank load is more than 0.98
+ * times the max PP force load. If so, PP DLB will not help,
+ * since we are (almost) limited by PME. Furthermore,
+ * DLB will cause a significant extra x/f redistribution
+ * cost on the PME ranks, which will then surely result
+ * in lower total performance.
+ * This check might be fragile, since one measurement
+ * below 0.98 (although only done once every 100 DD part.)
+ * could turn on DLB for the rest of the run.
+ */
+ if (cr->npmenodes > 0 &&
+ dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ {
+ bTurnOnDLB = FALSE;
+ }
+ else
+ {
+ bTurnOnDLB =
+ (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ }
if (debug)
{
fprintf(debug, "step %s, imb loss %f\n",
int nthreads, f_thread_t *f_t)
{
int t, i;
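+ /* gmx_unused because the variable is only read in the OpenMP pragma below */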
+ int nthreads_loop gmx_unused;
/* This reduction can run over any number of threads */
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
+ nthreads_loop = gmx_omp_nthreads_get(emntBonded);
+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
for (i = 0; i < n; i++)
{
for (t = 1; t < nthreads; t++)
t_grpopts *opts;
gmx_groups_t *groups;
gmx_molblock_t *molblock;
+ int nthreads gmx_unused;
bLJPME = EVDW_PME(ir->vdwtype);
alook = gmx_mtop_atomlookup_init(mtop);
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static)
+ nthreads = gmx_omp_nthreads_get(emntDefault);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (i = 0; i < md->nr; i++)
{
int g, ag, molb;
int start, end;
rvec *x1, *x2;
real dvdl_constr;
+ int nthreads gmx_unused;
s1 = &ems1->s;
s2 = &ems2->s;
x1 = s1->x;
x2 = s2->x;
-#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+ nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
{
int gf, i, m;
nbnxn_cu_kfunc_ptr_t res;
assert(eeltype < eelCuNR);
- assert(evdwtype < eelCuNR);
+ assert(evdwtype < evdwCuNR);
if (bDoEne)
{
/* Analytical Ewald interaction kernels with twin-range cut-off
*/
#define EL_EWALD_ANA
-#define LJ_CUTOFF_CHECK
+#define VDW_CUTOFF_CHECK
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
#undef NB_KERNEL_FUNC_NAME
#undef EL_EWALD_ANA
-#undef LJ_CUTOFF_CHECK
+#undef VDW_CUTOFF_CHECK
/* Tabulated Ewald interaction kernels */
/* Tabulated Ewald interaction kernels with twin-range cut-off */
#define EL_EWALD_TAB
-#define LJ_CUTOFF_CHECK
+#define VDW_CUTOFF_CHECK
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
#undef NB_KERNEL_FUNC_NAME
#undef EL_EWALD_TAB
-#undef LJ_CUTOFF_CHECK
+#undef VDW_CUTOFF_CHECK
nbnxn_pairlist_t **nbl;
int coulkt, vdwkt = 0;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported VdW interaction type");
}}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{{
nbnxn_atomdata_output_t *out;
int coult;
int vdwt;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported vdwtype in nbnxn reference kernel");
}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{
nbnxn_atomdata_output_t *out;
nbnxn_pairlist_t **nbl;
int coulkt, vdwkt = 0;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported VdW interaction type");
}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{
nbnxn_atomdata_output_t *out;
nbnxn_pairlist_t **nbl;
int coulkt, vdwkt = 0;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported VdW interaction type");
}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{
nbnxn_atomdata_output_t *out;
float *bbcz;
nbnxn_bb_t *bb;
int ncd, sc;
+ int nthreads gmx_unused;
grid = &nbs->grid[0];
bbcz = grid->bbcz_simple;
bb = grid->bb_simple;
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ nthreads = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (sc = 0; sc < grid->nc; sc++)
{
int c, tx, na;
{
int nsci, ncj4, nexcl;
int n, i;
+ int nthreads gmx_unused;
if (nblc->bSimple)
{
/* Each thread should copy its own data to the combined arrays,
* as otherwise data will go back and forth between different caches.
*/
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ nthreads = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (n = 0; n < nnbl; n++)
{
int sci_offset;
nlist->jindex[nri+1] = nlist->nrj;
len = nlist->nrj - nlist->jindex[nri];
+ /* If there are no j-particles we have to reduce the
+ * number of i-particles again, to prevent errors in the
+ * kernel functions.
+ */
+ if ((len == 0) && (nlist->nri > 0))
+ {
+ nlist->nri--;
+ }
}
}
if (bDoForces && DOMAINDECOMP(cr))
{
+ if (bUseGPU)
+ {
+ /* We are done with the CPU compute, but the GPU local non-bonded
+ * kernel can still be running while we communicate the forces.
+ * We start a counter here, so we can, hopefully, time the rest
+ * of the GPU kernel execution and data transfer.
+ */
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+ }
+
/* Communicate the forces */
wallcycle_start(wcycle, ewcMOVEF);
dd_move_f(cr->dd, f, fr->fshift);
/* wait for local forces (or calculate in emulation mode) */
if (bUseGPU)
{
+ float cycles_tmp, cycles_wait_est;
+ const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
nbnxn_cuda_wait_gpu(nbv->cu_nbv,
nbv->grp[eintLocal].nbat,
flags, eatLocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ if (bDoForces && DOMAINDECOMP(cr))
+ {
+ cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+
+ if (cycles_tmp < cuda_api_overhead_margin)
+ {
+ /* We measured few cycles, it could be that the kernel
+ * and transfer finished earlier and there was no actual
+ * wait time, only API call overhead.
+ * Then the actual time could be anywhere between 0 and
+ * cycles_wait_est. As a compromise, we use half the time.
+ */
+ cycles_wait_est *= 0.5f;
+ }
+ }
+ else
+ {
+ /* No force communication so we actually timed the wait */
+ cycles_wait_est = cycles_tmp;
+ }
+ /* Even though this is after dd_move_f, the actual task we are
+ * waiting for runs asynchronously with dd_move_f and we usually
+ * have nothing to balance it with, so we can and should add
+ * the time to the force time for load balancing.
+ */
+ cycles_force += cycles_wait_est;
+ cycles_wait_gpu += cycles_wait_est;
/* now clear the GPU outputs while we finish the step on the CPU */
}
else
{
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
+ nth = gmx_omp_nthreads_get(emntUpdate);
+
+#pragma omp parallel for num_threads(nth) schedule(static)
for (i = start; i < nrend; i++)
{
copy_rvec(upd->xp[i], state->x[i]);
"DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
- "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+ "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
"Vsite spread", "COM pull force",
"Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
wcc = wc->wcc;
+ /* The GPU wait estimate counter is used for load balancing only
+ * and will mess up the total due to double counting: clear it.
+ */
+ wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
+ wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
+
for (i = 0; i < ewcNR; i++)
{
if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
- ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
+ ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS,
ewcVSITESPREAD, ewcPULLPOT,
ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
ewcTEST, ewcNR
}
dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+ if (bPMETuneRunning &&
+ use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+ !(cr->duty & DUTY_PME))
+ {
+ /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+ * With GPUs + separate PME ranks, we don't want DLB.
+ * DLB could get turned on while we scan coarse grids and
+ * would then never be turned off again.
+ * This would hurt performance at the final, optimal
+ * grid spacing, where DLB almost never helps.
+ * Also, DLB can limit the cut-off for PME tuning.
+ */
+ dd_dlb_set_lock(cr->dd, TRUE);
+ }
+
if (bPMETuneRunning || step_rel > ir->nstlist*50)
{
bPMETuneTry = FALSE;
{
calc_enervirdiff(NULL, ir->eDispCorr, fr);
}
+
+ if (!bPMETuneRunning &&
+ DOMAINDECOMP(cr) &&
+ dd_dlb_is_locked(cr->dd))
+ {
+ /* Unlock DLB=auto; DLB is now allowed to activate
+ * (but we don't expect it to activate in most cases).
+ */
+ dd_dlb_set_lock(cr->dd, FALSE);
+ }
}
cycles_pmes = 0;
}
while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
set->rcut_coulomb = pme_lb->cut_spacing*sp;
+ if (set->rcut_coulomb < pme_lb->rcut_coulomb_start)
+ {
+ /* This is unlikely, but can happen when e.g. continuing from
+ * a checkpoint after equilibration where the box shrank a lot.
+ * We want to avoid rcoulomb getting smaller than rvdw
+ * and there might be more issues with decreasing rcoulomb.
+ */
+ set->rcut_coulomb = pme_lb->rcut_coulomb_start;
+ }
if (pme_lb->cutoff_scheme == ecutsVERLET)
{