Merge "Merge release-5-1 into master"

author Erik Lindahl <erik.lindahl@gmail.com>

Thu, 9 Jul 2015 10:56:19 +0000 (12:56 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Thu, 9 Jul 2015 10:56:20 +0000 (12:56 +0200)
author Erik Lindahl <erik.lindahl@gmail.com>
Thu, 9 Jul 2015 10:56:19 +0000 (12:56 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 9 Jul 2015 10:56:20 +0000 (12:56 +0200)
diff --git a/cmake/FindPythonModule.cmake b/cmake/FindPythonModule.cmake

index b8e965d1e69ee6955f6faa7d900b6214c8e68758..02773c643f44ee705b0b1ee974cf8f56ad08157b 100644 (file)
--- a/cmake/FindPythonModule.cmake
+++ b/cmake/FindPythonModule.cmake
@@ -32,29 +32,48 @@
  # To help us fund GROMACS development, we humbly ask that you cite
  # the research papers on the package. Check out http://www.gromacs.org.
  
-# Adapted from code posted on cmake-users by Mark Moll
+# Adapted from code posted on cmake-users by Mark Moll (the execute_process()
+# call remains, but other things have been rewritten for nicer behavior).
  find_package(PythonInterp)
-function(find_python_module module)
-    string(TOUPPER ${module} module_upper)
-    if(NOT PYTHONMODULE_${module_upper})
-        if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
-           set(${module}_FIND_REQUIRED TRUE)
-       endif()
-        if (NOT PYTHON_EXECUTABLE)
-            message(STATUS "Cannot find python module ${module} because no python executable is known")
-        else()
-           # A module's location is usually a directory, but for binary modules
-           # it's a .so file.
-           execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-               "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
-               RESULT_VARIABLE _${module}_status 
-               OUTPUT_VARIABLE _${module}_location
-               ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+function (find_python_module module)
+    string(TOUPPER ${module} _module_upper)
+    set(_find_package_module ${module})
+    set(_out_var PYTHONMODULE_${_module_upper})
+
+    include(CMakeParseArguments)
+    set(_options QUIET REQUIRED)
+    cmake_parse_arguments(ARG "${_options}" "" "" ${ARGN})
+    if (ARG_UNPARSED_ARGUMENTS)
+        message(FATAL_ERROR "Unknown arguments: ${ARG_UNPARSED_ARGUMENTS}")
+    endif()
+    if (ARG_REQUIRED)
+        set(${_find_package_module}_FIND_REQUIRED TRUE)
+    endif()
+    if (ARG_QUIET)
+        set(${_find_package_module}_FIND_QUIETLY TRUE)
+    endif()
+
+    if (NOT ${_out_var})
+        set(_status 1)
+        if (PYTHON_EXECUTABLE)
+            # A module's location is usually a directory, but for binary modules
+            # it's a .so file.
+            execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+                "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
+                RESULT_VARIABLE _status
+                OUTPUT_VARIABLE _location
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+        endif()
+        if(_status)
+            set(_location ${_find_package_module}-NOTFOUND)
          endif()
-       if(NOT _${module}_status)
-           set(PYTHONMODULE_${module_upper} ${_${module}_location} CACHE STRING 
-               "Location of Python module ${module}")
-       endif()
+        set(${_out_var} ${_location} CACHE STRING
+            "Location of Python module ${module}" FORCE)
+        mark_as_advanced(${_out_var})
      endif()
-    find_package_handle_standard_args(PYTHONMODULE_${module} DEFAULT_MSG PYTHONMODULE_${module_upper})
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(
+        ${_find_package_module} DEFAULT_MSG
+        ${_out_var} PYTHON_EXECUTABLE)
  endfunction()
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake

index 46bf447abd082c53b535767578c6650ededb3eee..6eae5130251ce42f8c45fa5af582348e39986d9c 100644 (file)
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -41,8 +41,7 @@ find_program(SPHINX_EXECUTABLE NAMES sphinx-build
  mark_as_advanced(SPHINX_EXECUTABLE)
  
  # Detect Sphinx version
-
-if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
+if (SPHINX_EXECUTABLE AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
      execute_process(
          COMMAND ${SPHINX_EXECUTABLE} --version
          OUTPUT_VARIABLE SPHINX_VERSION_OUTPUT_VARIABLE
@@ -50,14 +49,17 @@ if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
          ERROR_QUIET
          OUTPUT_STRIP_TRAILING_WHITESPACE
          )
-    string(REGEX REPLACE "Sphinx \\(${SPHINX_EXECUTABLE}\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
+    string(REGEX REPLACE "Sphinx \\([^)]*\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
      set(SPHINX_EXECUTABLE_VERSION "${SPHINX_EXECUTABLE_VERSION}" CACHE INTERNAL "Version of ${SPHINX_EXECUTABLE}")
  endif()
  
+set(_find_deps_options)
+if (Sphinx_FIND_QUIETLY)
+    set(_find_deps_options QUIET)
+endif()
  include(FindPythonModule)
-find_python_module(pygments)
-
-if(PYTHONMODULE_PYGMENTS)
+find_python_module(pygments ${_find_deps_options})
+if (PYTHONMODULE_PYGMENTS)
      set(Sphinx_pygments_FOUND 1)
  endif()
  
diff --git a/cmake/gmxVersionInfo.cmake b/cmake/gmxVersionInfo.cmake

index 162b32818e3d36046e31d45f4c2d84fc83b5a1de..f7a412b281051f9bc5352f926b4885c2f8873f10 100644 (file)
--- a/cmake/gmxVersionInfo.cmake
+++ b/cmake/gmxVersionInfo.cmake
@@ -253,7 +253,7 @@ set(REGRESSIONTEST_BRANCH "refs/heads/master")
  # each release. It's hard to test because it is only used for
  # REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
  # been placed on the server.
-set(REGRESSIONTEST_MD5SUM "6f8531a6e3c2a8912327b9cd450d8745" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+set(REGRESSIONTEST_MD5SUM "bb67f145095249e9d4a93227fc4c352e" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
  
  math(EXPR GMX_VERSION_NUMERIC
       "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_MINOR}*100 + ${GMX_VERSION_PATCH}")
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt

index d3f190a6358ced923edfa66d449cd76e5acfb4d6..17a7681c8bb774afdaa4c57e95b6b5c4dcf7553c 100644 (file)
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -59,7 +59,7 @@ mark_as_advanced(SOURCE_MD5SUM)
  set(EXPECTED_DOXYGEN_VERSION 1.8.5)
  
  find_package(PythonInterp)
-find_package(Sphinx 1.2.3 COMPONENTS pygments)
+find_package(Sphinx 1.2.3 QUIET COMPONENTS pygments)
  
  # Even if we aren't going to make the full webpage, set up to put all
  # the documentation output in the same place, for convenience
@@ -250,6 +250,10 @@ else()
          COMMAND ${CMAKE_COMMAND} -E echo
              "HTML pages cannot be built because Sphinx is not available"
          VERBATIM)
+    add_custom_target(install-guide
+        COMMAND ${CMAKE_COMMAND} -E echo
+            "INSTALL cannot be built because Sphinx is not available"
+        VERBATIM)
      add_custom_target(man
          COMMAND ${CMAKE_COMMAND} -E echo
              "man pages cannot be built because Sphinx is not available"
diff --git a/docs/user-guide/mdp-options.rst b/docs/user-guide/mdp-options.rst

index 5767bd4f9966c9405716e61103c595bdffdbce84..4fb9c0b204d0e8ed344bc5d32ac9bace94a23294 100644 (file)
--- a/docs/user-guide/mdp-options.rst
+++ b/docs/user-guide/mdp-options.rst
@@ -1695,16 +1695,6 @@ applicable pulling coordinate.
     system, *e.g.* a water slab (see Engin et al. J. Chem. Phys. B
     2010).
  
-.. mdp:: pull-coord1-groups
-
-   The two groups indices should be given on which this pull
-   coordinate will operate. The first index can be 0, in which case an
-   absolute reference of :mdp:`pull-coord1-origin` is used. With an
-   absolute reference the system is no longer translation invariant
-   and one should think about what to do with the center of mass
-   motion. Note that (only) for :mdp:`pull-coord1-geometry` =
-   :mdp-value:`direction-relative` four groups are required.
-
  .. mdp:: pull-coord1-type:
  
     .. mdp-value:: umbrella
@@ -1780,6 +1770,16 @@ applicable pulling coordinate.
        component. This geometry is not supported with constraint
        pulling.
  
+.. mdp:: pull-coord1-groups
+
+   The two groups indices should be given on which this pull
+   coordinate will operate. The first index can be 0, in which case an
+   absolute reference of :mdp:`pull-coord1-origin` is used. With an
+   absolute reference the system is no longer translation invariant
+   and one should think about what to do with the center of mass
+   motion. Note that (only) for :mdp:`pull-coord1-geometry` =
+   :mdp-value:`direction-relative` four groups are required.
+
  .. mdp:: pull-coord1-dim
  
     (Y Y Y)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index e8de945b94ed92b198368896fec394f7ba5f441e..3986ed0562d64a22bccafd585c90c8dcc6926125 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -216,9 +216,20 @@ enum {
  };
  
  enum {
-    edlbAUTO, edlbNO, edlbYES, edlbNR
+    edlbsOffForever,           /* DLB is off and will never be turned on */
+    edlbsOffCanTurnOn,         /* DLB is off and will turn on with imbalance */
+    edlbsOffTemporarilyLocked, /* DLB is off and temporarily can not turn on */
+    edlbsOn,                   /* DLB is on and will stay on forever */
+    edlbsNR
  };
-const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
+/* Allowed DLB state transitions:
+ *   edlbsOffCanTurnOn         -> edlbsOn
+ *   edlbsOffCanTurnOn         -> edlbsOffForever
+ *   edlbsOffCanTurnOn         -> edlbsOffTemporarilyLocked
+ *   edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
+ */
+
+const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
  
  typedef struct
  {
@@ -297,14 +308,10 @@ typedef struct gmx_domdec_comm
      t_blocka *cglink;
      char     *bLocalCG;
  
-    /* The DLB option */
-    int      eDLB;
-    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
-    gmx_bool bDLB_locked;
-    /* With eDLB=edlbAUTO, should we check if to DLB on at the next DD? */
+    /* The DLB state, possible values are defined above */
+    int      dlbState;
+    /* With dlbState=edlbsOffCanTurnOn, should we check if to DLB on at the next DD? */
      gmx_bool bCheckWhetherToTurnDlbOn;
-    /* Are we actually using DLB? */
-    gmx_bool bDynLoadBal;
  
      /* Cell sizes for static load balancing, first index cartesian */
      real **slb_frac;
@@ -587,6 +594,11 @@ t_block *dd_charge_groups_global(gmx_domdec_t *dd)
      return &dd->comm->cgs_gl;
  }
  
+static bool dlbIsOn(const gmx_domdec_comm_t *comm)
+{
+    return (comm->dlbState == edlbsOn);
+}
+
  static void vec_rvec_init(vec_rvec_t *v)
  {
      v->nalloc = 0;
@@ -665,7 +677,7 @@ void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
          dim         = dd->dim[d];
          shift0[dim] = zones->izone[izone].shift0[dim];
          shift1[dim] = zones->izone[izone].shift1[dim];
-        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
+        if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
          {
              /* A conservative approach, this can be optimized */
              shift0[dim] -= 1;
@@ -3219,7 +3231,7 @@ static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
          }
      }
  
-    if (!comm->bDynLoadBal)
+    if (!dlbIsOn(comm))
      {
          copy_rvec(cellsize_min, comm->cellsize_min);
      }
@@ -3815,7 +3827,7 @@ static void set_dd_cell_sizes(gmx_domdec_t *dd,
      copy_rvec(comm->cell_x0, comm->old_cell_x0);
      copy_rvec(comm->cell_x1, comm->old_cell_x1);
  
-    if (comm->bDynLoadBal)
+    if (dlbIsOn(comm))
      {
          if (DDMASTER(dd))
          {
@@ -3856,7 +3868,7 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
          /* Without PBC we don't have restrictions on the outer cells */
          if (!(dim >= ddbox->npbcdim &&
                (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
-            comm->bDynLoadBal &&
+            dlbIsOn(comm) &&
              (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
              comm->cellsize_min[dim])
          {
@@ -3870,11 +3882,11 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
          }
      }
  
-    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
+    if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
      {
          /* Communicate the boundaries and update cell_ns_x0/1 */
          dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
-        if (dd->bGridJump && dd->ndim > 1)
+        if (dlbIsOn(dd->comm) && dd->ndim > 1)
          {
              check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
          }
@@ -5000,7 +5012,7 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
                  /* Check which direction this cg should go */
                  for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
                  {
-                    if (dd->bGridJump)
+                    if (dlbIsOn(dd->comm))
                      {
                          /* The cell boundaries for dimension d2 are not equal
                           * for each cell row of the lower dimension(s),
@@ -5299,7 +5311,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
              (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
          {
              load = &comm->load[d];
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
              {
                  cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
              }
@@ -5308,7 +5320,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
              {
                  sbuf[pos++] = dd_force_load(comm);
                  sbuf[pos++] = sbuf[0];
-                if (dd->bGridJump)
+                if (dlbIsOn(dd->comm))
                  {
                      sbuf[pos++] = sbuf[0];
                      sbuf[pos++] = cell_frac;
@@ -5328,7 +5340,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
              {
                  sbuf[pos++] = comm->load[d+1].sum;
                  sbuf[pos++] = comm->load[d+1].max;
-                if (dd->bGridJump)
+                if (dlbIsOn(dd->comm))
                  {
                      sbuf[pos++] = comm->load[d+1].sum_m;
                      sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
@@ -5357,7 +5369,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
              if (dd->ci[dim] == dd->master_ci[dim])
              {
                  /* We are the root, process this row */
-                if (comm->bDynLoadBal)
+                if (dlbIsOn(comm))
                  {
                      root = comm->root[d];
                  }
@@ -5374,7 +5386,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                      load->sum += load->load[pos++];
                      load->max  = std::max(load->max, load->load[pos]);
                      pos++;
-                    if (dd->bGridJump)
+                    if (dlbIsOn(dd->comm))
                      {
                          if (root->bLimited)
                          {
@@ -5408,7 +5420,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                          pos++;
                      }
                  }
-                if (comm->bDynLoadBal && root->bLimited)
+                if (dlbIsOn(comm) && root->bLimited)
                  {
                      load->sum_m *= dd->nc[dim];
                      load->flags |= (1<<d);
@@ -5423,7 +5435,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
          comm->load_step  += comm->cycl[ddCyclStep];
          comm->load_sum   += comm->load[0].sum;
          comm->load_max   += comm->load[0].max;
-        if (comm->bDynLoadBal)
+        if (dlbIsOn(comm))
          {
              for (d = 0; d < dd->ndim; d++)
              {
@@ -5492,7 +5504,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
              fprintf(stderr, "%s", buf);
          }
          bLim = FALSE;
-        if (comm->bDynLoadBal)
+        if (dlbIsOn(comm))
          {
              sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
              for (d = 0; d < dd->ndim; d++)
@@ -5535,7 +5547,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
              sprintf(buf,
                      "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                      "      in the domain decomposition.\n", lossf*100);
-            if (!comm->bDynLoadBal)
+            if (!dlbIsOn(comm))
              {
                  sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
              }
@@ -5580,6 +5592,9 @@ static float dd_f_imbal(gmx_domdec_t *dd)
  
  float dd_pme_f_ratio(gmx_domdec_t *dd)
  {
+    /* Should only be called on the DD master rank */
+    assert(DDMASTER(dd));
+
      if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
      {
          return dd->comm->load[0].pme/dd->comm->load[0].mdf;
@@ -5610,7 +5625,7 @@ static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
          fprintf(fplog, "\n");
      }
      fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
-    if (dd->comm->bDynLoadBal)
+    if (dlbIsOn(dd->comm))
      {
          fprintf(fplog, "  vol min/aver %5.3f%c",
                  dd_vol_min(dd), flags ? '!' : ' ');
@@ -5628,7 +5643,7 @@ static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
  
  static void dd_print_load_verbose(gmx_domdec_t *dd)
  {
-    if (dd->comm->bDynLoadBal)
+    if (dlbIsOn(dd->comm))
      {
          fprintf(stderr, "vol %4.2f%c ",
                  dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
@@ -5669,7 +5684,7 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
      if (bPartOfGroup)
      {
          dd->comm->mpi_comm_load[dim_ind] = c_row;
-        if (dd->comm->eDLB != edlbNO)
+        if (dd->comm->dlbState != edlbsOffForever)
          {
              if (dd->ci[dim] == dd->master_ci[dim])
              {
@@ -5963,7 +5978,7 @@ void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
          }
      }
  
-    if (dd->comm->eDLB != edlbNO)
+    if (dd->comm->dlbState != edlbsOffForever)
      {
          snew(dd->comm->root, dd->ndim);
      }
@@ -6540,60 +6555,60 @@ static int check_dlb_support(FILE *fplog, t_commrec *cr,
                               const char *dlb_opt, gmx_bool bRecordLoad,
                               unsigned long Flags, t_inputrec *ir)
  {
-    int           eDLB = -1;
+    int           dlbState = -1;
      char          buf[STRLEN];
  
      switch (dlb_opt[0])
      {
-        case 'a': eDLB = edlbAUTO; break;
-        case 'n': eDLB = edlbNO;   break;
-        case 'y': eDLB = edlbYES;  break;
+        case 'a': dlbState = edlbsOffCanTurnOn; break;
+        case 'n': dlbState = edlbsOffForever;   break;
+        case 'y': dlbState = edlbsOn;           break;
          default: gmx_incons("Unknown dlb_opt");
      }
  
      if (Flags & MD_RERUN)
      {
-        return edlbNO;
+        return edlbsOffForever;
      }
  
      if (!EI_DYNAMICS(ir->eI))
      {
-        if (eDLB == edlbYES)
+        if (dlbState == edlbsOn)
          {
              sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
              dd_warning(cr, fplog, buf);
          }
  
-        return edlbNO;
+        return edlbsOffForever;
      }
  
      if (!bRecordLoad)
      {
          dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
  
-        return edlbNO;
+        return edlbsOffForever;
      }
  
      if (Flags & MD_REPRODUCIBLE)
      {
-        switch (eDLB)
+        switch (dlbState)
          {
-            case edlbNO:
+            case edlbsOffForever:
                  break;
-            case edlbAUTO:
+            case edlbsOffCanTurnOn:
                  dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
-                eDLB = edlbNO;
+                dlbState = edlbsOffForever;
                  break;
-            case edlbYES:
+            case edlbsOn:
                  dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
                  break;
              default:
-                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
+                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
                  break;
          }
      }
  
-    return eDLB;
+    return dlbState;
  }
  
  static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
@@ -6739,16 +6754,14 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
      /* Initialize to GPU share count to 0, might change later */
      comm->nrank_gpu_shared = 0;
  
-    comm->eDLB                     = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
-    comm->bDLB_locked              = FALSE;
+    comm->dlbState                 = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
      comm->bCheckWhetherToTurnDlbOn = TRUE;
  
-    comm->bDynLoadBal = (comm->eDLB == edlbYES);
      if (fplog)
      {
-        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
+        fprintf(fplog, "Dynamic load balancing: %s\n",
+                edlbs_names[comm->dlbState]);
      }
-    dd->bGridJump              = comm->bDynLoadBal;
      comm->bPMELoadBalDLBLimits = FALSE;
  
      if (comm->nstSortCG)
@@ -6941,7 +6954,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
  
          /* We need to choose the optimal DD grid and possibly PME nodes */
          limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
-                               comm->eDLB != edlbNO, dlb_scale,
+                               comm->dlbState != edlbsOffForever, dlb_scale,
                                 comm->cellsize_limit, comm->cutoff,
                                 comm->bInterCGBondeds);
  
@@ -6950,7 +6963,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
              bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
              sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
                      !bC ? "-rdd" : "-rcon",
-                    comm->eDLB != edlbNO ? " or -dds" : "",
+                    comm->dlbState != edlbsOffForever ? " or -dds" : "",
                      bC ? " or your LINCS settings" : "");
  
              gmx_fatal_collective(FARGS, cr, NULL,
@@ -7045,7 +7058,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
      *npme_y = comm->npmenodes_y;
  
      snew(comm->slb_frac, DIM);
-    if (comm->eDLB == edlbNO)
+    if (comm->dlbState == edlbsOffForever)
      {
          comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
          comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
@@ -7054,7 +7067,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
  
      if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
      {
-        if (comm->bBondComm || comm->eDLB != edlbNO)
+        if (comm->bBondComm || comm->dlbState != edlbsOffForever)
          {
              /* Set the bonded communication distance to halfway
               * the minimum and the maximum,
@@ -7062,7 +7075,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
               */
              acs                = average_cellsize_min(dd, ddbox);
              comm->cutoff_mbody = 0.5*(r_bonded + acs);
-            if (comm->eDLB != edlbNO)
+            if (comm->dlbState != edlbsOffForever)
              {
                  /* Check if this does not limit the scaling */
                  comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
@@ -7153,14 +7166,13 @@ static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
          dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
  
          /* Change DLB from "auto" to "no". */
-        comm->eDLB = edlbNO;
+        comm->dlbState = edlbsOffForever;
  
          return;
      }
  
      dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
-    comm->bDynLoadBal = TRUE;
-    dd->bGridJump     = TRUE;
+    comm->dlbState = edlbsOn;
  
      set_dlb_limits(dd);
  
@@ -7331,7 +7343,7 @@ static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
                      std::max(comm->cutoff, comm->cutoff_mbody));
              fprintf(fplog, "%40s  %-7s %6.3f nm\n",
                      "multi-body bonded interactions", "(-rdd)",
-                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
+                    (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
          }
          if (dd->vsite_comm)
          {
@@ -7450,7 +7462,7 @@ static void set_cell_limits_dlb(gmx_domdec_t      *dd,
      {
          comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
      }
-    if (comm->bDynLoadBal)
+    if (dlbIsOn(comm))
      {
          set_dlb_limits(dd);
      }
@@ -7509,13 +7521,13 @@ void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
      {
          fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
      }
-    if (comm->eDLB != edlbNO)
+    if (comm->dlbState != edlbsOffForever)
      {
          set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
      }
  
-    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
-    if (comm->eDLB == edlbAUTO)
+    print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox);
+    if (comm->dlbState == edlbsOffCanTurnOn)
      {
          if (fplog)
          {
@@ -7571,7 +7583,7 @@ static gmx_bool test_dd_cutoff(t_commrec *cr,
  
          np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
  
-        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
+        if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
              dd->comm->cd[d].np_dlb > 0)
          {
              if (np > dd->comm->cd[d].np_dlb)
@@ -7590,12 +7602,12 @@ static gmx_bool test_dd_cutoff(t_commrec *cr,
          }
      }
  
-    if (dd->comm->eDLB != edlbNO)
+    if (dd->comm->dlbState != edlbsOffForever)
      {
          /* If DLB is not active yet, we don't need to check the grid jumps.
           * Actually we shouldn't, because then the grid jump data is not set.
           */
-        if (dd->comm->bDynLoadBal &&
+        if (dlbIsOn(dd->comm) &&
              check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
          {
              LocallyLimited = 1;
@@ -7655,7 +7667,7 @@ void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
   */
  static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
  {
-    if (dd->comm->eDLB == edlbAUTO && !dd_dlb_is_locked(dd))
+    if (dd->comm->dlbState == edlbsOffCanTurnOn)
      {
          dd->comm->bCheckWhetherToTurnDlbOn = bValue;
      }
@@ -7668,7 +7680,7 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
  {
      const int nddp_chk_dlb = 100;
  
-    if (dd->comm->eDLB != edlbAUTO)
+    if (dd->comm->dlbState != edlbsOffCanTurnOn)
      {
          return FALSE;
      }
@@ -7695,30 +7707,30 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
  
  gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
  {
-    return dd->comm->bDynLoadBal;
+    return (dd->comm->dlbState == edlbsOn);
  }
  
  gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
  {
-    return dd->comm->bDLB_locked;
+    return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
  }
  
  void dd_dlb_lock(gmx_domdec_t *dd)
  {
      /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
-    if (dd->comm->eDLB == edlbAUTO)
+    if (dd->comm->dlbState == edlbsOffCanTurnOn)
      {
-        dd->comm->bDLB_locked = TRUE;
+        dd->comm->dlbState = edlbsOffTemporarilyLocked;
      }
  }
  
  void dd_dlb_unlock(gmx_domdec_t *dd)
  {
      /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
-    if (dd->comm->eDLB == edlbAUTO)
+    if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
      {
-        dd->comm->bDLB_locked = FALSE;
-        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, !dd->comm->bDynLoadBal);
+        dd->comm->dlbState = edlbsOffCanTurnOn;
+        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
      }
  }
  
@@ -7883,7 +7895,7 @@ set_dd_corners(const gmx_domdec_t *dd,
          c->c[1][0] = comm->cell_x0[dim1];
          /* All rows can see this row */
          c->c[1][1] = comm->cell_x0[dim1];
-        if (dd->bGridJump)
+        if (dlbIsOn(dd->comm))
          {
              c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
              if (bDistMB)
@@ -7902,7 +7914,7 @@ set_dd_corners(const gmx_domdec_t *dd,
              {
                  c->c[2][j] = comm->cell_x0[dim2];
              }
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
              {
                  /* Use the maximum of the i-cells that see a j-cell */
                  for (i = 0; i < zones->nizone; i++)
@@ -7937,7 +7949,7 @@ set_dd_corners(const gmx_domdec_t *dd,
               */
              c->cr1[0] = comm->cell_x1[dim1];
              c->cr1[3] = comm->cell_x1[dim1];
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
              {
                  c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                  if (bDistMB)
@@ -8276,7 +8288,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
      bBondComm = comm->bBondComm;
  
      /* Do we need to determine extra distances for multi-body bondeds? */
-    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
  
      /* Do we need to determine extra distances for only two-body bondeds? */
      bDist2B = (bBondComm && !bDistMB);
@@ -8708,7 +8720,7 @@ static void set_zones_size(gmx_domdec_t *dd,
      zones = &comm->zones;
  
      /* Do we need to determine extra distances for multi-body bondeds? */
-    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
  
      for (z = zone_start; z < zone_end; z++)
      {
@@ -8728,7 +8740,7 @@ static void set_zones_size(gmx_domdec_t *dd,
              /* With a staggered grid we have different sizes
               * for non-shifted dimensions.
               */
-            if (dd->bGridJump && zones->shift[z][dim] == 0)
+            if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
              {
                  if (d == 1)
                  {
@@ -8757,7 +8769,7 @@ static void set_zones_size(gmx_domdec_t *dd,
              if (zones->shift[z][dim] > 0)
              {
                  dim = dd->dim[d];
-                if (!dd->bGridJump || d == 0)
+                if (!dlbIsOn(dd->comm) || d == 0)
                  {
                      zones->size[z].x0[dim] = comm->cell_x1[dim];
                      zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
@@ -9469,7 +9481,7 @@ void dd_partition_system(FILE                *fplog,
  
      bNStGlobalComm = (step % nstglobalcomm == 0);
  
-    if (!comm->bDynLoadBal)
+    if (!dlbIsOn(comm))
      {
          bDoDLB = FALSE;
      }
@@ -9630,7 +9642,7 @@ void dd_partition_system(FILE                *fplog,
          set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                    TRUE, &top_local->cgs, state_local->x, &ddbox);
  
-        bRedist = comm->bDynLoadBal;
+        bRedist = dlbIsOn(comm);
      }
      else
      {
diff --git a/src/gromacs/domdec/domdec_specatomcomm.cpp b/src/gromacs/domdec/domdec_specatomcomm.cpp

index abc52ba315ff1505a201ba75ba33dca2e0f03fdf..75d6a2e3d1d761362878195e719ae99763d44af6 100644 (file)
--- a/src/gromacs/domdec/domdec_specatomcomm.cpp
+++ b/src/gromacs/domdec/domdec_specatomcomm.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -603,7 +603,7 @@ int setup_specat_communication(gmx_domdec_t             *dd,
                    dd->ci[XX], dd->ci[YY], dd->ci[ZZ],
                    nrecv_local, ireq->n, specat_type,
                    specat_type, add_err,
-                  dd->bGridJump ? " or use the -rcon option of mdrun" : "");
+                  dd_dlb_is_on(dd) ? " or use the -rcon option of mdrun" : "");
      }
  
      spac->at_start = at_start;
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp

index f8facc562183a87e048b29f33b1a87766a935aa9..089361143ab59700537fdd283d5bb1c129a5bf02 100644 (file)
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -934,11 +934,17 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
          {
              pme_lb->bBalance = dd_dlb_is_on(cr->dd);
          }
-        else
+        /* We should ignore the first timing to avoid timing allocation
+         * overhead. And since the PME load balancing is called just
+         * before DD repartitioning, the ratio returned by dd_pme_f_ratio
+         * is not over the last nstlist steps, but the nstlist steps before
+         * that. So the first useful ratio is available at step_rel=3*nstlist.
+         */
+        else if (step_rel >= 3*ir->nstlist)
          {
              if (DDMASTER(cr->dd))
              {
-                /* PME node load is too high, start tuning */
+                /* If PME rank load is too high, start tuning */
                  pme_lb->bBalance =
                      (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
              }
@@ -1019,7 +1025,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
      }
  
      if (!pme_lb->bBalance &&
-        (!pme_lb->bSepPMERanks || (step_rel <= pme_lb->step_rel_stop)))
+        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
      {
          /* We have just deactivated the balancing and we're not measuring PP/PME
           * imbalance during the first steps of the run: deactivate the tuning.
diff --git a/src/gromacs/gmxana/gmx_dos.c b/src/gromacs/gmxana/gmx_dos.c

index 7592091e9b5253518470e14026979b8945646846..d4423adc47ca1e50d943d761b0b704f1f4d2d1c6 100644 (file)
--- a/src/gromacs/gmxana/gmx_dos.c
+++ b/src/gromacs/gmxana/gmx_dos.c
@@ -56,6 +56,7 @@
  #include "gromacs/math/units.h"
  #include "gromacs/math/utilities.h"
  #include "gromacs/math/vec.h"
+#include "gromacs/topology/index.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/futil.h"
  #include "gromacs/utility/smalloc.h"
@@ -65,6 +66,40 @@ enum {
      VACF, MVACF, DOS, DOS_SOLID, DOS_DIFF, DOS_CP, DOS_S, DOS_A, DOS_E, DOS_NR
  };
  
+static int calcMoleculesInIndexGroup(t_block *mols, int natoms, atom_id *index, int nindex)
+{
+    int   i    = 0;
+    int   mol  = 0;
+    int   nMol = 0;
+    int   j;
+
+    while (i < nindex)
+    {
+        while (index[i] > mols->index[mol])
+        {
+            mol++;
+            if (mol >= mols->nr)
+            {
+                gmx_fatal(FARGS, "Atom index out of range: %d", index[i]+1);
+            }
+        }
+        for (j = mols->index[mol]; j < mols->index[mol+1]; j++)
+        {
+            if (index[i] != j)
+            {
+                gmx_fatal(FARGS, "The index group does not consist of whole molecules");
+            }
+            i++;
+            if (i == natoms)
+            {
+                gmx_fatal(FARGS, "Index contains atom numbers larger than the topology");
+            }
+        }
+        nMol++;
+    }
+    return nMol;
+}
+
  static double FD(double Delta, double f)
  {
      return (2*pow(Delta, -4.5)*pow(f, 7.5) -
@@ -214,47 +249,6 @@ static real wEsolid(real nu, real beta)
      }
  }
  
-static void dump_fy(output_env_t oenv, real toler)
-{
-    FILE       *fp;
-    double      Delta, f, y, DD;
-    const char *leg[] = { "f", "fy", "y" };
-
-    DD = pow(10.0, 0.125);
-    fp = xvgropen("fy.xvg", "Fig. 2, Lin2003a", "Delta", "y or fy", oenv);
-    xvgr_legend(fp, asize(leg), leg, oenv);
-    if (output_env_get_print_xvgr_codes(oenv))
-    {
-        fprintf(fp, "@    world 1e-05, 0, 1000, 1\n");
-        fprintf(fp, "@    xaxes scale Logarithmic\n");
-    }
-    for (Delta = 1e-5; (Delta <= 1000); Delta *= DD)
-    {
-        f = calc_fluidicity(Delta, toler);
-        y = calc_y(f, Delta, toler);
-        fprintf(fp, "%10g  %10g  %10g  %10g\n", Delta, f, f*y, y);
-    }
-    xvgrclose(fp);
-}
-
-static void dump_w(output_env_t oenv, real beta)
-{
-    FILE       *fp;
-    double      nu;
-    const char *leg[] = { "wCv", "wS", "wA", "wE" };
-
-    fp = xvgropen("w.xvg", "Fig. 1, Berens1983a", "\\f{12}b\\f{4}h\\f{12}n",
-                  "w", oenv);
-    xvgr_legend(fp, asize(leg), leg, oenv);
-    for (nu = 1; (nu < 100); nu += 0.05)
-    {
-        fprintf(fp, "%10g  %10g  %10g  %10g  %10g\n", beta*PLANCK*nu,
-                wCsolid(nu, beta), wSsolid(nu, beta),
-                wAsolid(nu, beta), wEsolid(nu, beta));
-    }
-    xvgrclose(fp);
-}
-
  int gmx_dos(int argc, char *argv[])
  {
      const char         *desc[] = {
@@ -264,6 +258,11 @@ int gmx_dos(int argc, char *argv[])
          "all vibrations. For flexible systems that would be around a few fs",
          "between saving. Properties based on the DoS are printed on the",
          "standard output."
+        "Note that the density of states is calculated from the mass-weighted",
+        "autocorrelation, and by default only from the square of the real",
+        "component rather than absolute value. This means the shape can differ",
+        "substantially from the plain vibrational power spectrum you can",
+        "calculate with gmx velacc."
      };
      const char         *bugs[] = {
          "This program needs a lot of memory: total usage equals the number of atoms times 3 times number of frames times 4 (or 8 when run in double precision)."
@@ -284,10 +283,16 @@ int gmx_dos(int argc, char *argv[])
      gmx_fft_t           fft;
      double              cP, S, A, E, DiffCoeff, Delta, f, y, z, sigHS, Shs, Sig, DoS0, recip_fac;
      double              wCdiff, wSdiff, wAdiff, wEdiff;
-
-    static     gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalize = FALSE;
-    static     gmx_bool bRecip   = FALSE, bDump = FALSE;
+    int                 grpNatoms;
+    atom_id            *index;
+    char               *grpname;
+    double              invNormalize;
+    gmx_bool            normalizeAutocorrelation;
+
+    static     gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalizeDos = FALSE;
+    static     gmx_bool bRecip   = FALSE;
      static     real     Temp     = 298.15, toler = 1e-6;
+
      t_pargs             pa[]     = {
          { "-v", FALSE, etBOOL, {&bVerbose},
            "Be loud and noisy." },
@@ -295,14 +300,12 @@ int gmx_dos(int argc, char *argv[])
            "Use cm^-1 on X-axis instead of 1/ps for DoS plots." },
          { "-abs", FALSE, etBOOL, {&bAbsolute},
            "Use the absolute value of the Fourier transform of the VACF as the Density of States. Default is to use the real component only" },
-        { "-normdos", FALSE, etBOOL, {&bNormalize},
-          "Normalize the DoS such that it adds up to 3N. This is a hack that should not be necessary." },
+        { "-normdos", FALSE, etBOOL, {&bNormalizeDos},
+          "Normalize the DoS such that it adds up to 3N. This should usually not be necessary." },
          { "-T", FALSE, etREAL, {&Temp},
            "Temperature in the simulation" },
          { "-toler", FALSE, etREAL, {&toler},
-          "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" },
-        { "-dump", FALSE, etBOOL, {&bDump},
-          "[HIDDEN]Dump the y/fy plot corresponding to Fig. 2 inLin2003a and the and the weighting functions corresponding to Fig. 1 in Berens1983a." }
+          "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" }
      };
  
      t_filenm            fnm[] = {
@@ -331,30 +334,26 @@ int gmx_dos(int argc, char *argv[])
      }
  
      beta = 1/(Temp*BOLTZ);
-    if (bDump)
-    {
-        printf("Dumping reference figures. Thanks for your patience.\n");
-        dump_fy(oenv, toler);
-        dump_w(oenv, beta);
-        exit(0);
-    }
  
      fplog = gmx_fio_fopen(ftp2fn(efLOG, NFILE, fnm), "w");
      fprintf(fplog, "Doing density of states analysis based on trajectory.\n");
      please_cite(fplog, "Pascal2011a");
      please_cite(fplog, "Caleman2011b");
  
-    read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box,
-                  TRUE);
+    read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box, TRUE);
+
+    /* Handle index groups */
+    get_index(&top.atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &grpNatoms, &index, &grpname);
+
      V     = det(box);
      tmass = 0;
-    for (i = 0; (i < top.atoms.nr); i++)
+    for (i = 0; i < grpNatoms; i++)
      {
-        tmass += top.atoms.atom[i].m;
+        tmass += top.atoms.atom[index[i]].m;
      }
  
-    Natom = top.atoms.nr;
-    Nmol  = top.mols.nr;
+    Natom = grpNatoms;
+    Nmol  = calcMoleculesInIndexGroup(&top.mols, top.atoms.nr, index, grpNatoms);
      gnx   = Natom*DIM;
  
      /* Correlation stuff */
@@ -390,9 +389,9 @@ int gmx_dos(int argc, char *argv[])
          }
          for (i = 0; i < gnx; i += DIM)
          {
-            c1[i+XX][nframes] = fr.v[i/DIM][XX];
-            c1[i+YY][nframes] = fr.v[i/DIM][YY];
-            c1[i+ZZ][nframes] = fr.v[i/DIM][ZZ];
+            c1[i+XX][nframes] = fr.v[index[i/DIM]][XX];
+            c1[i+YY][nframes] = fr.v[index[i/DIM]][YY];
+            c1[i+ZZ][nframes] = fr.v[index[i/DIM]][ZZ];
          }
  
          t1 = fr.time;
@@ -413,6 +412,21 @@ int gmx_dos(int argc, char *argv[])
          printf("Going to do %d fourier transforms of length %d. Hang on.\n",
                 gnx, nframes);
      }
+    /* Unfortunately the -normalize program option for the autocorrelation
+     * function calculation is added as a hack with a static variable in the
+     * autocorrelation.c source. That would work if we called the normal
+     * do_autocorr(), but this routine overrides that by directly calling
+     * the low-level functionality. That unfortunately leads to ignoring the
+     * default value for the option (which is to normalize).
+     * Since the absolute value seems to be important for the subsequent
+     * analysis below, we detect the value directly from the option, calculate
+     * the autocorrelation without normalization, and then apply the
+     * normalization just to the autocorrelation output
+     * (or not, if the user asked for a non-normalized autocorrelation).
+     */
+    normalizeAutocorrelation = opt2parg_bool("-normalize", npargs, ppa);
+
+    /* Note that we always disable normalization here, regardless of user settings */
      low_do_autocorr(NULL, oenv, NULL, nframes, gnx, nframes, c1, dt, eacNormal, 0, FALSE,
                      FALSE, FALSE, -1, -1, 0);
      snew(dos, DOS_NR);
@@ -427,7 +441,7 @@ int gmx_dos(int argc, char *argv[])
      }
      for (i = 0; (i < gnx); i += DIM)
      {
-        mi = top.atoms.atom[i/DIM].m;
+        mi = top.atoms.atom[index[i/DIM]].m;
          for (j = 0; (j < nframes/2); j++)
          {
              c1j            = (c1[i+XX][j] + c1[i+YY][j] + c1[i+ZZ][j]);
@@ -435,20 +449,28 @@ int gmx_dos(int argc, char *argv[])
              dos[MVACF][j] += mi*c1j;
          }
      }
-    fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity ACF",
+
+    fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity autocorrelation function",
                    "Time (ps)", "C(t)", oenv);
      snew(tt, nframes/2);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
      for (j = 0; (j < nframes/2); j++)
      {
          tt[j] = j*dt;
-        fprintf(fp, "%10g  %10g\n", tt[j], dos[VACF][j]);
+        fprintf(fp, "%10g  %10g\n", tt[j], dos[VACF][j] * invNormalize);
      }
      xvgrclose(fp);
-    fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity ACF",
+
+    fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity autocorrelation function",
                    "Time (ps)", "C(t)", oenv);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
      for (j = 0; (j < nframes/2); j++)
      {
-        fprintf(fp, "%10g  %10g\n", tt[j], dos[MVACF][j]);
+        fprintf(fp, "%10g  %10g\n", tt[j], dos[MVACF][j] * invNormalize);
      }
      xvgrclose(fp);
  
@@ -483,7 +505,7 @@ int gmx_dos(int argc, char *argv[])
      }
      /* Normalize it */
      dostot = evaluate_integral(nframes/4, nu, dos[DOS], NULL, nframes/4, &stddev);
-    if (bNormalize)
+    if (bNormalizeDos)
      {
          for (j = 0; (j < nframes/4); j++)
          {
@@ -571,18 +593,6 @@ int gmx_dos(int argc, char *argv[])
      cP = BOLTZ * evaluate_integral(nframes/4, nu, dos[DOS_CP], NULL,
                                     nframes/4, &stddev);
      fprintf(fplog, "Heat capacity %g J/mol K\n", 1000*cP/Nmol);
-
-    /*
-       S  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_S],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Entropy %g J/mol K\n",1000*S/Nmol);
-       A  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_A],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Helmholtz energy %g kJ/mol\n",A/Nmol);
-       E  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_E],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Internal energy %g kJ/mol\n",E/Nmol);
-     */
      fprintf(fplog, "\nArrivederci!\n");
      gmx_fio_fclose(fplog);
  
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.cpp b/src/gromacs/gmxlib/gmx_detect_hardware.cpp

index 5e4733df1d9ed932ba935cc4c13ddec26156c892..e93fe43d10971cb0dda10d1aa38f38be56fce2d6 100644 (file)
--- a/src/gromacs/gmxlib/gmx_detect_hardware.cpp
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
@@ -77,29 +77,40 @@
  
  
  #ifdef GMX_GPU
-const gmx_bool bGPUBinary = TRUE;
+
+static const bool  bGPUBinary = TRUE;
+
  #  ifdef GMX_USE_OPENCL
-const char    *gpu_implementation        = "OpenCL";
+
+static const char *gpu_implementation       = "OpenCL";
  /* Our current OpenCL implementation only supports using exactly one
   * GPU per PP rank, so sharing is impossible */
-const gmx_bool bGpuSharingSupported      = FALSE;
+static const bool bGpuSharingSupported      = false;
  /* Our current OpenCL implementation is not known to handle
   * concurrency correctly (at context creation, JIT compilation, or JIT
   * cache-management stages). OpenCL runtimes need not support it
   * either; library MPI segfaults when creating OpenCL contexts;
   * thread-MPI seems to work but is not yet known to be safe. */
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#  else
-const char    *gpu_implementation        = "CUDA";
-const gmx_bool bGpuSharingSupported      = TRUE;
-const gmx_bool bMultiGpuPerNodeSupported = TRUE;
-#  endif
-#else
-const gmx_bool bGPUBinary                = FALSE;
-const char    *gpu_implementation        = "non-GPU";
-const gmx_bool bGpuSharingSupported      = FALSE;
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#endif
+static const bool bMultiGpuPerNodeSupported = false;
+
+#  else /* GMX_USE_OPENCL */
+
+// Our CUDA implementation supports everything
+static const char *gpu_implementation        = "CUDA";
+static const bool  bGpuSharingSupported      = true;
+static const bool  bMultiGpuPerNodeSupported = true;
+
+#  endif /* GMX_USE_OPENCL */
+
+#else    /* GMX_GPU */
+
+// Not compiled with GPU support
+static const bool  bGPUBinary                = false;
+static const char *gpu_implementation        = "non-GPU";
+static const bool  bGpuSharingSupported      = false;
+static const bool  bMultiGpuPerNodeSupported = false;
+
+#endif /* GMX_GPU */
  
  /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
  const char * const gpu_detect_res_str[egpuNR] =
@@ -124,6 +135,16 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank);
  static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
                                      const gmx_gpu_opt_t  *gpu_opt);
  
+gmx_bool gmx_multiple_gpu_per_node_supported()
+{
+    return bMultiGpuPerNodeSupported;
+}
+
+gmx_bool gmx_gpu_sharing_supported()
+{
+    return bGpuSharingSupported;
+}
+
  static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
  {
      int      i, ndev;
@@ -440,7 +461,8 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
          }
          else
          {
-            if (ngpu_comp > npppn)
+            /* TODO Should we have a gpu_opt->n_dev_supported field? */
+            if (ngpu_comp > npppn && gmx_multiple_gpu_per_node_supported())
              {
                  md_print_warn(cr, fplog,
                                "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
@@ -460,13 +482,26 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
                   */
                  if (cr->rank_pp_intranode == 0)
                  {
+                    std::string reasonForLimit;
+                    if (ngpu_comp > 1 &&
+                        ngpu_use == 1 &&
+                        !gmx_multiple_gpu_per_node_supported())
+                    {
+                        reasonForLimit  = "can be used by ";
+                        reasonForLimit += gpu_implementation;
+                        reasonForLimit += " in GROMACS";
+                    }
+                    else
+                    {
+                        reasonForLimit = "was detected";
+                    }
                      gmx_fatal(FARGS,
                                "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                              "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+                              "%s was started with %d PP %s%s%s, but only %d GPU%s %s.",
                                th_or_proc, btMPI ? "s" : "es", pernode,
                                ShortProgram(), npppn, th_or_proc,
                                th_or_proc_plural, pernode,
-                              ngpu_use, gpu_use_plural);
+                              ngpu_use, gpu_use_plural, reasonForLimit.c_str());
                  }
              }
          }
@@ -552,7 +587,7 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
      {
          int device_id;
  
-        device_id           = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+        device_id           = gmx_gpu_sharing_supported() ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
          uniq_ids[device_id] = 1;
      }
      /* Count the devices used. */
@@ -1121,11 +1156,11 @@ void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
          parse_digits_from_plain_string(env,
                                         &gpu_opt->n_dev_use,
                                         &gpu_opt->dev_use);
-        if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+        if (!gmx_multiple_gpu_per_node_supported() && 1 < gpu_opt->n_dev_use)
          {
              gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per node", gpu_implementation);
          }
-        if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+        if (!gmx_gpu_sharing_supported() && anyGpuIdIsRepeated(gpu_opt))
          {
              gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
          }
@@ -1231,7 +1266,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
      {
          if (nrank % gpu_opt->n_dev_compatible == 0)
          {
-            nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
+            nshare = gmx_gpu_sharing_supported() ? nrank/gpu_opt->n_dev_compatible : 1;
          }
          else
          {
@@ -1252,7 +1287,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
  
      /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
      gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
-    if (!bMultiGpuPerNodeSupported)
+    if (!gmx_multiple_gpu_per_node_supported())
      {
          gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
      }
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp

index 783016de1a7766766a1568d8020df9a2c996b025..4e43fd0e1c634829241bb7bb1edc454953260c3b 100644 (file)
--- a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
@@ -750,7 +750,7 @@ ocl_get_build_options_string(cl_context           context,
  
          for (std::string::size_type i = 0; i < unescaped_ocl_root_path.length(); i++)
          {
-            if (inputStr[i] == ' ')
+            if (unescaped_ocl_root_path[i] == ' ')
              {
                  ocl_root_path.push_back('\\');
              }
diff --git a/src/gromacs/gmxpreprocess/readpull.c b/src/gromacs/gmxpreprocess/readpull.c

index 49793283ccf7f5a9bcc232805e68cae1cd5d74cc..f2fb2d220309a6e6e44b9fa8d471b588f773cf1b 100644 (file)
--- a/src/gromacs/gmxpreprocess/readpull.c
+++ b/src/gromacs/gmxpreprocess/readpull.c
@@ -262,12 +262,12 @@ char **read_pullparams(int *ninp_p, t_inpfile **inp_p,
          int ngroup;
  
          pcrd = &pull->coord[i-1];
-        sprintf(buf, "pull-coord%d-groups", i);
-        STYPE(buf,              groups, "");
          sprintf(buf, "pull-coord%d-type", i);
          EETYPE(buf,             pcrd->eType, epull_names);
          sprintf(buf, "pull-coord%d-geometry", i);
          EETYPE(buf,             pcrd->eGeom, epullg_names);
+        sprintf(buf, "pull-coord%d-groups", i);
+        STYPE(buf,              groups, "");
  
          nscan  = sscanf(groups, "%d %d %d %d %d", &pcrd->group[0], &pcrd->group[1],  &pcrd->group[2], &pcrd->group[3], &idum);
          ngroup = (pcrd->eGeom == epullgDIRRELATIVE) ? 4 : 2;
diff --git a/src/gromacs/legacyheaders/gmx_detect_hardware.h b/src/gromacs/legacyheaders/gmx_detect_hardware.h

index 56b0384fe6fee1d84f97e1503a3a13a0e57a8702..338ad8dd81a15684b3c262332514043d6f16c817 100644 (file)
--- a/src/gromacs/legacyheaders/gmx_detect_hardware.h
+++ b/src/gromacs/legacyheaders/gmx_detect_hardware.h
@@ -45,6 +45,18 @@ extern "C" {
  } /* fixes auto-indentation problems */
  #endif
  
+/*! \brief Return whether mdrun can use more than one GPU per node
+ *
+ * The OpenCL implementation cannot use more than one GPU per node,
+ * for example. */
+gmx_bool gmx_multiple_gpu_per_node_supported();
+
+/*! \brief Return whether PP ranks can share a GPU
+ *
+ * The OpenCL implementation cannot share a GPU between ranks, for
+ * example. */
+gmx_bool gmx_gpu_sharing_supported();
+
  /* the init and consistency functions depend on commrec that may not be
     consistent in cuda because MPI types don't exist there.  */
  #ifndef __CUDACC__
diff --git a/src/gromacs/legacyheaders/types/commrec.h b/src/gromacs/legacyheaders/types/commrec.h

index cfc7669a2f1876d0bbd9b55985d4fe15c1ad97ea..64521b85aeee77be7712adc7ada3a7d6dc756142 100644 (file)
--- a/src/gromacs/legacyheaders/types/commrec.h
+++ b/src/gromacs/legacyheaders/types/commrec.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -161,7 +161,6 @@ struct gmx_domdec_t {
      ivec     nc;
      int      ndim;
      ivec     dim; /* indexed by 0 to ndim */
-    gmx_bool bGridJump;
  
      /* PBC from dim 0 to npbcdim */
      int npbcdim;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h

index f6ee45829229e765c638bc25ddce8074884db7bb..e09de482f9e9d3dd0291780243b03a242a3111f1 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -297,16 +297,16 @@ struct gmx_nbnxn_ocl_t
      cl_command_queue    stream[2];      /**< local and non-local GPU queues                             */
  
      /** events used for synchronization */
-    cl_event nonlocal_done;              /**< event triggered when the non-local non-bonded kernel
-                                            is done (and the local transfer can proceed) */
-    cl_event isc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
-                                            the local stream that need to precede the
-                                            non-local force calculations are done
-                                            (e.g. f buffer 0-ing, local x/q H2D) */
-
-    cl_bool                     bDoTime; /**< True if event-based timing is enabled.                     */
-    cl_timers_t                *timers;  /**< OpenCL event-based timers.                                 */
-    struct gmx_wallclock_gpu_t *timings; /**< Timing data.                                               */
+    cl_event nonlocal_done;               /**< event triggered when the non-local non-bonded kernel
+                                             is done (and the local transfer can proceed) */
+    cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
+                                             the local stream that need to precede the
+                                             non-local force calculations are done
+                                             (e.g. f buffer 0-ing, local x/q H2D) */
+
+    cl_bool                     bDoTime;  /**< True if event-based timing is enabled.                     */
+    cl_timers_t                *timers;   /**< OpenCL event-based timers.                                 */
+    struct gmx_wallclock_gpu_t *timings;  /**< Timing data.                                               */
  };
  
  #ifdef __cplusplus
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c

index a3720823567862c8d3740ef5973d352417be082c..16fd3b010127634444485d6a7b183a2192331ad9 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -4014,7 +4014,8 @@ static void split_sci_entry(nbnxn_pairlist_t *nbl,
                  nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
              }
  
-            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
+            /* Check if we should split at this cj4 to get a list of size nsp */
+            if (nsp > 0 && nsp + nsp_cj4 > nsp_max)
              {
                  /* Split the list at cj4 */
                  nbl->sci[sci].cj4_ind_end = cj4;
@@ -4379,7 +4380,7 @@ static void get_nsubpair_target(const nbnxn_search_t  nbs,
  
      ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
      ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
-    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
+    ls[ZZ] = grid->na_c/(grid->atom_density*ls[XX]*ls[YY]);
  
      /* The average squared length of the diagonal of a sub cell */
      xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
@@ -4409,11 +4410,25 @@ static void get_nsubpair_target(const nbnxn_search_t  nbs,
          /* 4 octants of a sphere */
          vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
  
+        /* Estimate the number of cluster pairs as the local number of
+         * clusters times the volume they interact with times the density.
+         */
          nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
  
          /* Subtract the non-local pair count */
          nsp_est -= nsp_est_nl;
  
+        /* For small cut-offs nsp_est will be an underesimate.
+         * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
+         * So to avoid too small or negative nsp_est we set a minimum of
+         * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
+         * This might be a slight overestimate for small non-periodic groups of
+         * atoms as will occur for a local domain with DD, but for small
+         * groups of atoms we'll anyhow be limited by nsubpair_target_min,
+         * so this overestimation will not matter.
+         */
+        nsp_est = max(nsp_est, grid->nsubc_tot*14.0);
+
          if (debug)
          {
              fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
diff --git a/src/gromacs/mdlib/nsgrid.cpp b/src/gromacs/mdlib/nsgrid.cpp

index b92d5a5c765bf73755c01e1355f10872341d303c..63649e3bee792e81ffb6af9f5954d14e7c5ef38e 100644 (file)
--- a/src/gromacs/mdlib/nsgrid.cpp
+++ b/src/gromacs/mdlib/nsgrid.cpp
@@ -250,7 +250,7 @@ static void set_grid_sizes(matrix box, rvec izones_x0, rvec izones_x1, real rlis
               * direction has uniform DD cell boundaries.
               */
              bDDRect = !(ddbox->tric_dir[i] ||
-                        (dd->bGridJump && i != dd->dim[0]));
+                        (dd_dlb_is_on(dd) && i != dd->dim[0]));
  
              radd = rlist;
              if (i >= ddbox->npbcdim &&
diff --git a/src/gromacs/trajectoryanalysis/modules/distance.cpp b/src/gromacs/trajectoryanalysis/modules/distance.cpp

index 96f752b5d54425812a8b7d2dbdda0a8346b67969..4b07d1483a4f5f81da6bba393b2f4a65d0c736bb 100644 (file)
--- a/src/gromacs/trajectoryanalysis/modules/distance.cpp
+++ b/src/gromacs/trajectoryanalysis/modules/distance.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -271,7 +271,7 @@ Distance::initAnalysis(const TrajectoryAnalysisSettings &settings,
      {
          AnalysisDataPlotModulePointer plotm(
                  new AnalysisDataPlotModule(settings.plotSettings()));
-        plotm->setFileName(fnAll_);
+        plotm->setFileName(fnXYZ_);
          plotm->setTitle("Distance");
          plotm->setXAxisIsTime();
          plotm->setYLabel("Distance (nm)");
diff --git a/src/programs/mdrun/resource-division.cpp b/src/programs/mdrun/resource-division.cpp

index 8378845cc1d9ffa213dbe37176fe0f7dda6bccf1..9042aee8a9c860ac8a4399d3abf55c3357de6c8e 100644 (file)
--- a/src/programs/mdrun/resource-division.cpp
+++ b/src/programs/mdrun/resource-division.cpp
@@ -53,6 +53,22 @@
  #include "gromacs/utility/fatalerror.h"
  
  
+/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
+ * The real switching points will depend on the system simulation,
+ * the algorithms used and the hardware it's running on, as well as if there
+ * are other jobs running on the same machine. We try to take into account
+ * factors that have a large influence, such as recent Intel CPUs being
+ * much better at wide multi-threading. The remaining factors should
+ * (hopefully) have a small influence, such that the performance just before
+ * and after a switch point doesn't change too much.
+ */
+
+#ifdef GMX_OPENMP
+static const bool bOMP = true;
+#else
+static const bool bOMP = false;
+#endif
+
  #ifdef GMX_THREAD_MPI
  /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
   * the number of threads will get lowered.
@@ -64,7 +80,7 @@ static const int min_atoms_per_gpu        = 900;
  /* TODO choose nthreads_omp based on hardware topology
     when we have a hardware topology detection library */
  /* First we consider the case of no MPI (1 MPI rank).
- * In general, when running up to 4 threads, OpenMP should be faster.
+ * In general, when running up to 8 threads, OpenMP should be faster.
   * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
   * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
   * even on two CPUs it's usually faster (but with many OpenMP threads
@@ -77,16 +93,16 @@ static const int min_atoms_per_gpu        = 900;
   * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
   * model numbers we ensure also future Intel CPUs are covered.
   */
-const int nthreads_omp_always_faster_default   =  6;
-const int nthreads_omp_always_faster_Nehalem   = 12;
-const int nthreads_omp_always_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_default   =  8;
+const int nthreads_omp_faster_Nehalem   = 12;
+const int nthreads_omp_faster_Intel_AVX = 16;
  /* For CPU only runs the fastest options are usually MPI or OpenMP only.
   * With one GPU, using MPI only is almost never optimal, so we need to
   * compare running pure OpenMP with combined MPI+OpenMP. This means higher
   * OpenMP threads counts can still be ok. Multiplying the numbers above
   * by a factor of 2 seems to be a good estimate.
   */
-const int nthreads_omp_always_faster_gpu_fac   =  2;
+const int nthreads_omp_faster_gpu_fac   =  2;
  
  /* This is the case with MPI (2 or more MPI PP ranks).
   * By default we will terminate with a fatal error when more than 8
@@ -104,34 +120,30 @@ const int nthreads_omp_mpi_ok_min_gpu          =  2;
  const int nthreads_omp_mpi_target_max          =  6;
  
  
-#ifdef GMX_USE_OPENCL
-static const bool bGpuSharingSupported = false;
-#else
-static const bool bGpuSharingSupported = true;
-#endif
-
-
-static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
+/* Returns the maximum OpenMP thread count for which using a single MPI rank
+ * should be faster than using multiple ranks with the same total thread count.
+ */
+static int nthreads_omp_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
  {
      int nth;
  
      if (gmx_cpuid_vendor(cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
          gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX))
      {
-        nth = nthreads_omp_always_faster_Intel_AVX;
+        nth = nthreads_omp_faster_Intel_AVX;
      }
      else if (gmx_cpuid_is_intel_nehalem(cpuid_info))
      {
-        nth = nthreads_omp_always_faster_Nehalem;
+        nth = nthreads_omp_faster_Nehalem;
      }
      else
      {
-        nth = nthreads_omp_always_faster_default;
+        nth = nthreads_omp_faster_default;
      }
  
      if (bUseGPU)
      {
-        nth *= nthreads_omp_always_faster_gpu_fac;
+        nth *= nthreads_omp_faster_gpu_fac;
      }
  
      nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
@@ -139,6 +151,26 @@ static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
      return nth;
  }
  
+/* Returns that maximum OpenMP thread count that passes the efficiency check */
+static int nthreads_omp_efficient_max(int gmx_unused nrank,
+                                      gmx_cpuid_t    cpuid_info,
+                                      gmx_bool       bUseGPU)
+{
+#if defined GMX_OPENMP && defined GMX_MPI
+    if (nrank > 1)
+    {
+        return nthreads_omp_mpi_ok_max;
+    }
+    else
+#endif
+    {
+        return nthreads_omp_faster(cpuid_info, bUseGPU);
+    }
+}
+
+/* Return the number of thread-MPI ranks to use.
+ * This is chosen such that we can always obey our own efficiency checks.
+ */
  static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                          const gmx_hw_opt_t  *hw_opt,
                                          int                  nthreads_tot,
@@ -161,9 +193,9 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
              /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
              nrank = nthreads_tot;
          }
-        else if (bGpuSharingSupported &&
-                 (nthreads_tot > nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                            ngpu > 0) ||
+        else if (gmx_gpu_sharing_supported() &&
+                 (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
+                                                     ngpu > 0) ||
                    (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
          {
              /* The high OpenMP thread count will likely result in sub-optimal
@@ -193,8 +225,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
      }
      else
      {
-        if (nthreads_tot <= nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                       ngpu > 0))
+        if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
          {
              /* Use pure OpenMP parallelization */
              nrank = 1;
@@ -210,6 +241,34 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
  }
  
  
+static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
+{
+    /* This code relies on the fact that GPU are not detected when GPU
+     * acceleration was disabled at run time by the user.
+     */
+    if (cutoff_scheme == ecutsVERLET &&
+        hwinfo->gpu_info.n_dev_compatible > 0)
+    {
+        if (gmx_multiple_gpu_per_node_supported())
+        {
+            return hwinfo->gpu_info.n_dev_compatible;
+        }
+        else
+        {
+            if (hwinfo->gpu_info.n_dev_compatible > 1)
+            {
+                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
+            }
+            return 1;
+        }
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+
  #ifdef GMX_THREAD_MPI
  /* Get the number of MPI ranks to use for thread-MPI based on how many
   * were requested, which algorithms we're using,
@@ -219,7 +278,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
   * with the hardware, except that ntmpi could be larger than #GPU.
   */
  int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                       const t_inputrec    *inputrec,
                       const gmx_mtop_t    *mtop,
                       const t_commrec     *cr,
@@ -227,16 +286,15 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  {
      int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
      int      min_atoms_per_mpi_rank;
-    gmx_bool bCanUseGPU;
  
      /* Check if an algorithm does not support parallel simulation.  */
      if (inputrec->eI == eiLBFGS ||
          inputrec->coulombtype == eelEWALD)
      {
-        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
          if (hw_opt->nthreads_tmpi > 1)
          {
-            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI rank, but an algorithm doesn't support that");
          }
  
          return 1;
@@ -266,16 +324,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
          nthreads_tot_max = nthreads_hw;
      }
  
-    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
-                  hwinfo->gpu_info.n_dev_compatible > 0);
-    if (bCanUseGPU)
-    {
-        ngpu = hwinfo->gpu_info.n_dev_compatible;
-    }
-    else
-    {
-        ngpu = 0;
-    }
+    ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);
  
      if (inputrec->cutoff_scheme == ecutsGROUP)
      {
@@ -297,7 +346,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
      }
      else
      {
-        if (bCanUseGPU)
+        if (ngpu >= 1)
          {
              min_atoms_per_mpi_rank = min_atoms_per_gpu;
          }
@@ -362,6 +411,26 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  
          nrank = nrank_new;
  
+        /* We reduced the number of tMPI ranks, which means we might violate
+         * our own efficiency checks if we simply use all hardware threads.
+         */
+        if (bOMP && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
+        {
+            /* The user set neither the total nor the OpenMP thread count,
+             * we should use all hardware threads, unless we will violate
+             * our own efficiency limitation on the thread count.
+             */
+            int  nt_omp_max;
+
+            nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);
+
+            if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
+            {
+                /* Limit the number of OpenMP threads to start */
+                hw_opt->nthreads_omp = nt_omp_max;
+            }
+        }
+
          fprintf(stderr, "\n");
          fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
          fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
@@ -375,7 +444,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  
  void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                          const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                          t_commrec           *cr,
                                          FILE                *fplog)
  {
@@ -392,6 +461,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
       * OpenMP have been initialized. Check that here.
       */
  #ifdef GMX_THREAD_MPI
+    assert(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max);
      assert(hw_opt->nthreads_tmpi >= 1);
  #endif
      assert(gmx_omp_nthreads_get(emntDefault) >= 1);
@@ -434,7 +504,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
      if (DOMAINDECOMP(cr) && cr->nnodes > 1)
      {
          if (nth_omp_max < nthreads_omp_mpi_ok_min ||
-            (!(ngpu > 0 && !bGpuSharingSupported) &&
+            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
               nth_omp_max > nthreads_omp_mpi_ok_max))
          {
              /* Note that we print target_max here, not ok_max */
@@ -443,7 +513,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                      nthreads_omp_mpi_ok_min,
                      nthreads_omp_mpi_target_max);
  
-            if (bNTOptSet)
+            if (bNtOmpOptionSet)
              {
                  md_print_warn(cr, fplog, "NOTE: %s\n", buf);
              }
@@ -460,8 +530,8 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
      else
      {
          /* No domain decomposition (or only one domain) */
-        if (!(ngpu > 0 && !bGpuSharingSupported) &&
-            nth_omp_max > nthreads_omp_always_faster(hwinfo->cpuid_info, ngpu > 0))
+        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
+            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
          {
              /* To arrive here, the user/system set #ranks and/or #OMPthreads */
              gmx_bool bEnvSet;
@@ -469,7 +539,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
  
              bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
  
-            if (bNTOptSet || bEnvSet)
+            if (bNtOmpOptionSet || bEnvSet)
              {
                  sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
              }
@@ -489,7 +559,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
               * with different values per rank or node, since in that case
               * the user can not set -ntomp to override the error.
               */
-            if (bNTOptSet || (bEnvSet && nth_omp_min != nth_omp_max))
+            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
              {
                  md_print_warn(cr, fplog, "NOTE: %s\n", buf);
              }
@@ -502,7 +572,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
  #else /* GMX_OPENMP && GMX_MPI */
        /* No OpenMP and/or MPI: it doesn't make much sense to check */
      GMX_UNUSED_VALUE(hw_opt);
-    GMX_UNUSED_VALUE(bNTOptSet);
+    GMX_UNUSED_VALUE(bNtOmpOptionSet);
      /* Check if we have more than 1 physical core, if detected,
       * or more than 1 hardware thread if physical cores were not detected.
       */
@@ -538,17 +608,18 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
      }
      if (hw_opt->nthreads_tmpi > 0)
      {
-        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
+        gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
      }
  #endif
  
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP)
      {
-        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        }
+        hw_opt->nthreads_omp = 1;
      }
-    hw_opt->nthreads_omp = 1;
-#endif
  
      if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
      {
@@ -559,14 +630,14 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
              hw_opt->nthreads_omp > 0 &&
              hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
          {
-            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
                        hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
          }
  
          if (hw_opt->nthreads_tmpi > 0 &&
              hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
          {
-            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
                        hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
          }
  
@@ -584,12 +655,10 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
          }
      }
  
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP && hw_opt->nthreads_omp > 1)
      {
          gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
      }
-#endif
  
      if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
      {
diff --git a/src/programs/mdrun/resource-division.h b/src/programs/mdrun/resource-division.h

index ca982377414f2c575358fdf0e5018513f01d9173..42f4f05182472753a829148947068115549e7129 100644 (file)
--- a/src/programs/mdrun/resource-division.h
+++ b/src/programs/mdrun/resource-division.h
@@ -45,9 +45,10 @@
   * At the point we have already called check_and_update_hw_opt.
   * Thus all options should be internally consistent and consistent
   * with the hardware, except that ntmpi could be larger than #GPU.
+ * If necessary, this function will modify hw_opt->nthreads_omp.
   */
  int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                       const t_inputrec    *inputrec,
                       const gmx_mtop_t    *mtop,
                       const t_commrec     *cr,
@@ -58,12 +59,12 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
   * intended to catch cases where the user starts 1 MPI rank per hardware
   * thread or 1 rank per physical node.
   * With a sub-optimal setup a note is printed to fplog and stderr when
- * bNtOptSet==TRUE; with bNtOptSet==FALSE a fatal error is issued.
+ * bNtOmpSet==TRUE; with bNtOptOptionSet==FALSE a fatal error is issued.
   * This function should be called after thread-MPI and OpenMP are set up.
   */
  void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                          const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                          t_commrec           *cr,
                                          FILE                *fplog);
  
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp

index a917c62ec581adecdcb06d00ab9b4d4d070b2a29..f25ad46c9b9c3b7bf35ba5d9927ce902b21bae19 100644 (file)
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -781,10 +781,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                                                   hw_opt,
                                                   inputrec, mtop,
                                                   cr, fplog);
-        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
-        {
-            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
-        }
  
          if (hw_opt->nthreads_tmpi > 1)
          {
author	Erik Lindahl <erik.lindahl@gmail.com>
	Thu, 9 Jul 2015 10:56:19 +0000 (12:56 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Thu, 9 Jul 2015 10:56:20 +0000 (12:56 +0200)
cmake/FindPythonModule.cmake		patch \| blob \| history
cmake/FindSphinx.cmake		patch \| blob \| history
cmake/gmxVersionInfo.cmake		patch \| blob \| history
docs/CMakeLists.txt		patch \| blob \| history
docs/user-guide/mdp-options.rst		patch \| blob \| history
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec_specatomcomm.cpp		patch \| blob \| history
src/gromacs/ewald/pme-load-balancing.cpp		patch \| blob \| history
src/gromacs/gmxana/gmx_dos.c		patch \| blob \| history
src/gromacs/gmxlib/gmx_detect_hardware.cpp		patch \| blob \| history
src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp		patch \| blob \| history
src/gromacs/gmxpreprocess/readpull.c		patch \| blob \| history
src/gromacs/legacyheaders/gmx_detect_hardware.h		patch \| blob \| history
src/gromacs/legacyheaders/types/commrec.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.c		patch \| blob \| history
src/gromacs/mdlib/nsgrid.cpp		patch \| blob \| history
src/gromacs/trajectoryanalysis/modules/distance.cpp		patch \| blob \| history
src/programs/mdrun/resource-division.cpp		patch \| blob \| history
src/programs/mdrun/resource-division.h		patch \| blob \| history
src/programs/mdrun/runner.cpp		patch \| blob \| history