# To help us fund GROMACS development, we humbly ask that you cite
# the research papers on the package. Check out http://www.gromacs.org.
-# Adapted from code posted on cmake-users by Mark Moll
+# Adapted from code posted on cmake-users by Mark Moll (the execute_process()
+# call remains, but other things have been rewritten for nicer behavior).
find_package(PythonInterp)
-function(find_python_module module)
- string(TOUPPER ${module} module_upper)
- if(NOT PYTHONMODULE_${module_upper})
- if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
- set(${module}_FIND_REQUIRED TRUE)
- endif()
- if (NOT PYTHON_EXECUTABLE)
- message(STATUS "Cannot find python module ${module} because no python executable is known")
- else()
- # A module's location is usually a directory, but for binary modules
- # it's a .so file.
- execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
- "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
- RESULT_VARIABLE _${module}_status
- OUTPUT_VARIABLE _${module}_location
- ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+function (find_python_module module)
+ string(TOUPPER ${module} _module_upper)
+ set(_find_package_module ${module})
+ set(_out_var PYTHONMODULE_${_module_upper})
+
+ include(CMakeParseArguments)
+ set(_options QUIET REQUIRED)
+ cmake_parse_arguments(ARG "${_options}" "" "" ${ARGN})
+ if (ARG_UNPARSED_ARGUMENTS)
+ message(FATAL_ERROR "Unknown arguments: ${ARG_UNPARSED_ARGUMENTS}")
+ endif()
+ if (ARG_REQUIRED)
+ set(${_find_package_module}_FIND_REQUIRED TRUE)
+ endif()
+ if (ARG_QUIET)
+ set(${_find_package_module}_FIND_QUIETLY TRUE)
+ endif()
+
+ if (NOT ${_out_var})
+ set(_status 1)
+ if (PYTHON_EXECUTABLE)
+ # A module's location is usually a directory, but for binary modules
+ # it's a .so file.
+ execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+ "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
+ RESULT_VARIABLE _status
+ OUTPUT_VARIABLE _location
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+ endif()
+ if(_status)
+ set(_location ${_find_package_module}-NOTFOUND)
endif()
- if(NOT _${module}_status)
- set(PYTHONMODULE_${module_upper} ${_${module}_location} CACHE STRING
- "Location of Python module ${module}")
- endif()
+ set(${_out_var} ${_location} CACHE STRING
+ "Location of Python module ${module}" FORCE)
+ mark_as_advanced(${_out_var})
endif()
- find_package_handle_standard_args(PYTHONMODULE_${module} DEFAULT_MSG PYTHONMODULE_${module_upper})
+ include(FindPackageHandleStandardArgs)
+ find_package_handle_standard_args(
+ ${_find_package_module} DEFAULT_MSG
+ ${_out_var} PYTHON_EXECUTABLE)
endfunction()
mark_as_advanced(SPHINX_EXECUTABLE)
# Detect Sphinx version
-
-if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
+if (SPHINX_EXECUTABLE AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
execute_process(
COMMAND ${SPHINX_EXECUTABLE} --version
OUTPUT_VARIABLE SPHINX_VERSION_OUTPUT_VARIABLE
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE
)
- string(REGEX REPLACE "Sphinx \\(${SPHINX_EXECUTABLE}\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
+ string(REGEX REPLACE "Sphinx \\([^)]*\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
set(SPHINX_EXECUTABLE_VERSION "${SPHINX_EXECUTABLE_VERSION}" CACHE INTERNAL "Version of ${SPHINX_EXECUTABLE}")
endif()
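For example, if SPHINX_EXECUTABLE is /usr/bin/sphinx-build and the version
output is "Sphinx (sphinx-build) 1.2.3", the old pattern only matched when
the name in parentheses was exactly the cached executable path, whereas the
relaxed pattern extracts "1.2.3" however the executable reports itself.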
+set(_find_deps_options)
+if (Sphinx_FIND_QUIETLY)
+ set(_find_deps_options QUIET)
+endif()
include(FindPythonModule)
-find_python_module(pygments)
-
-if(PYTHONMODULE_PYGMENTS)
+find_python_module(pygments ${_find_deps_options})
+if (PYTHONMODULE_PYGMENTS)
set(Sphinx_pygments_FOUND 1)
endif()
# each release. It's hard to test because it is only used for
# REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
# been placed on the server.
-set(REGRESSIONTEST_MD5SUM "6f8531a6e3c2a8912327b9cd450d8745" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+set(REGRESSIONTEST_MD5SUM "bb67f145095249e9d4a93227fc4c352e" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
math(EXPR GMX_VERSION_NUMERIC
"${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_MINOR}*100 + ${GMX_VERSION_PATCH}")
set(EXPECTED_DOXYGEN_VERSION 1.8.5)
find_package(PythonInterp)
-find_package(Sphinx 1.2.3 COMPONENTS pygments)
+find_package(Sphinx 1.2.3 QUIET COMPONENTS pygments)
# Even if we aren't going to make the full webpage, set up to put all
# the documentation output in the same place, for convenience
COMMAND ${CMAKE_COMMAND} -E echo
"HTML pages cannot be built because Sphinx is not available"
VERBATIM)
+ add_custom_target(install-guide
+ COMMAND ${CMAKE_COMMAND} -E echo
+ "INSTALL cannot be built because Sphinx is not available"
+ VERBATIM)
add_custom_target(man
COMMAND ${CMAKE_COMMAND} -E echo
"man pages cannot be built because Sphinx is not available"
system, *e.g.* a water slab (see Engin et al. J. Chem. Phys. B
2010).
-.. mdp:: pull-coord1-groups
-
- The two groups indices should be given on which this pull
- coordinate will operate. The first index can be 0, in which case an
- absolute reference of :mdp:`pull-coord1-origin` is used. With an
- absolute reference the system is no longer translation invariant
- and one should think about what to do with the center of mass
- motion. Note that (only) for :mdp:`pull-coord1-geometry` =
- :mdp-value:`direction-relative` four groups are required.
-
.. mdp:: pull-coord1-type
.. mdp-value:: umbrella
component. This geometry is not supported with constraint
pulling.
+.. mdp:: pull-coord1-groups
+
+   The indices of the two groups on which this pull coordinate will
+   operate should be given. The first index can be 0, in which case an
+ absolute reference of :mdp:`pull-coord1-origin` is used. With an
+ absolute reference the system is no longer translation invariant
+ and one should think about what to do with the center of mass
+ motion. Note that (only) for :mdp:`pull-coord1-geometry` =
+ :mdp-value:`direction-relative` four groups are required.
+
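For example (the group numbers are illustrative), the common two-group case
and the four-group direction-relative case named above would look like:

    pull-coord1-geometry = distance
    pull-coord1-groups   = 1 2

    pull-coord1-geometry = direction-relative
    pull-coord1-groups   = 1 2 3 4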
.. mdp:: pull-coord1-dim
(Y Y Y)
};
enum {
- edlbAUTO, edlbNO, edlbYES, edlbNR
+ edlbsOffForever, /* DLB is off and will never be turned on */
+ edlbsOffCanTurnOn, /* DLB is off and will turn on with imbalance */
+    edlbsOffTemporarilyLocked, /* DLB is off and temporarily cannot be turned on */
+ edlbsOn, /* DLB is on and will stay on forever */
+ edlbsNR
};
-const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
+/* Allowed DLB state transitions:
+ * edlbsOffCanTurnOn -> edlbsOn
+ * edlbsOffCanTurnOn -> edlbsOffForever
+ * edlbsOffCanTurnOn -> edlbsOffTemporarilyLocked
+ * edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
+ */
+
+const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
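As a reading aid for the transition diagram above, a minimal sketch of a
checker that encodes the allowed transitions (a hypothetical helper, not
part of this change):

    static gmx_bool dlbTransitionIsAllowed(int from, int to)
    {
        switch (from)
        {
            case edlbsOffCanTurnOn:
                /* May turn on, give up forever, or get locked temporarily */
                return (to == edlbsOn ||
                        to == edlbsOffForever ||
                        to == edlbsOffTemporarilyLocked);
            case edlbsOffTemporarilyLocked:
                /* The only exit from a temporary lock is back to "can turn on" */
                return (to == edlbsOffCanTurnOn);
            default:
                /* edlbsOn and edlbsOffForever are terminal states */
                return FALSE;
        }
    }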
typedef struct
{
t_blocka *cglink;
char *bLocalCG;
- /* The DLB option */
- int eDLB;
- /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
- gmx_bool bDLB_locked;
- /* With eDLB=edlbAUTO, should we check if to DLB on at the next DD? */
+ /* The DLB state, possible values are defined above */
+ int dlbState;
+    /* With dlbState=edlbsOffCanTurnOn, should we check whether to turn DLB on at the next DD partitioning? */
gmx_bool bCheckWhetherToTurnDlbOn;
- /* Are we actually using DLB? */
- gmx_bool bDynLoadBal;
/* Cell sizes for static load balancing, first index cartesian */
real **slb_frac;
return &dd->comm->cgs_gl;
}
+static bool dlbIsOn(const gmx_domdec_comm_t *comm)
+{
+ return (comm->dlbState == edlbsOn);
+}
+
static void vec_rvec_init(vec_rvec_t *v)
{
v->nalloc = 0;
dim = dd->dim[d];
shift0[dim] = zones->izone[izone].shift0[dim];
shift1[dim] = zones->izone[izone].shift1[dim];
- if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
+ if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
{
/* A conservative approach, this can be optimized */
shift0[dim] -= 1;
}
}
- if (!comm->bDynLoadBal)
+ if (!dlbIsOn(comm))
{
copy_rvec(cellsize_min, comm->cellsize_min);
}
copy_rvec(comm->cell_x0, comm->old_cell_x0);
copy_rvec(comm->cell_x1, comm->old_cell_x1);
- if (comm->bDynLoadBal)
+ if (dlbIsOn(comm))
{
if (DDMASTER(dd))
{
/* Without PBC we don't have restrictions on the outer cells */
if (!(dim >= ddbox->npbcdim &&
(dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
- comm->bDynLoadBal &&
+ dlbIsOn(comm) &&
(comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
comm->cellsize_min[dim])
{
}
}
- if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
+ if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
{
/* Communicate the boundaries and update cell_ns_x0/1 */
dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
- if (dd->bGridJump && dd->ndim > 1)
+ if (dlbIsOn(dd->comm) && dd->ndim > 1)
{
check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
}
/* Check which direction this cg should go */
for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
{
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
/* The cell boundaries for dimension d2 are not equal
* for each cell row of the lower dimension(s),
(dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
{
load = &comm->load[d];
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
}
{
sbuf[pos++] = dd_force_load(comm);
sbuf[pos++] = sbuf[0];
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
sbuf[pos++] = sbuf[0];
sbuf[pos++] = cell_frac;
{
sbuf[pos++] = comm->load[d+1].sum;
sbuf[pos++] = comm->load[d+1].max;
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
sbuf[pos++] = comm->load[d+1].sum_m;
sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
if (dd->ci[dim] == dd->master_ci[dim])
{
/* We are the root, process this row */
- if (comm->bDynLoadBal)
+ if (dlbIsOn(comm))
{
root = comm->root[d];
}
load->sum += load->load[pos++];
load->max = std::max(load->max, load->load[pos]);
pos++;
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
if (root->bLimited)
{
pos++;
}
}
- if (comm->bDynLoadBal && root->bLimited)
+ if (dlbIsOn(comm) && root->bLimited)
{
load->sum_m *= dd->nc[dim];
load->flags |= (1<<d);
comm->load_step += comm->cycl[ddCyclStep];
comm->load_sum += comm->load[0].sum;
comm->load_max += comm->load[0].max;
- if (comm->bDynLoadBal)
+ if (dlbIsOn(comm))
{
for (d = 0; d < dd->ndim; d++)
{
fprintf(stderr, "%s", buf);
}
bLim = FALSE;
- if (comm->bDynLoadBal)
+ if (dlbIsOn(comm))
{
sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
for (d = 0; d < dd->ndim; d++)
sprintf(buf,
"NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
" in the domain decomposition.\n", lossf*100);
- if (!comm->bDynLoadBal)
+ if (!dlbIsOn(comm))
{
sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n");
}
float dd_pme_f_ratio(gmx_domdec_t *dd)
{
+ /* Should only be called on the DD master rank */
+ assert(DDMASTER(dd));
+
if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
{
return dd->comm->load[0].pme/dd->comm->load[0].mdf;
fprintf(fplog, "\n");
}
fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
- if (dd->comm->bDynLoadBal)
+ if (dlbIsOn(dd->comm))
{
fprintf(fplog, " vol min/aver %5.3f%c",
dd_vol_min(dd), flags ? '!' : ' ');
static void dd_print_load_verbose(gmx_domdec_t *dd)
{
- if (dd->comm->bDynLoadBal)
+ if (dlbIsOn(dd->comm))
{
fprintf(stderr, "vol %4.2f%c ",
dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
if (bPartOfGroup)
{
dd->comm->mpi_comm_load[dim_ind] = c_row;
- if (dd->comm->eDLB != edlbNO)
+ if (dd->comm->dlbState != edlbsOffForever)
{
if (dd->ci[dim] == dd->master_ci[dim])
{
}
}
- if (dd->comm->eDLB != edlbNO)
+ if (dd->comm->dlbState != edlbsOffForever)
{
snew(dd->comm->root, dd->ndim);
}
const char *dlb_opt, gmx_bool bRecordLoad,
unsigned long Flags, t_inputrec *ir)
{
- int eDLB = -1;
+ int dlbState = -1;
char buf[STRLEN];
switch (dlb_opt[0])
{
- case 'a': eDLB = edlbAUTO; break;
- case 'n': eDLB = edlbNO; break;
- case 'y': eDLB = edlbYES; break;
+ case 'a': dlbState = edlbsOffCanTurnOn; break;
+ case 'n': dlbState = edlbsOffForever; break;
+ case 'y': dlbState = edlbsOn; break;
default: gmx_incons("Unknown dlb_opt");
}
if (Flags & MD_RERUN)
{
- return edlbNO;
+ return edlbsOffForever;
}
if (!EI_DYNAMICS(ir->eI))
{
- if (eDLB == edlbYES)
+ if (dlbState == edlbsOn)
{
sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
dd_warning(cr, fplog, buf);
}
- return edlbNO;
+ return edlbsOffForever;
}
if (!bRecordLoad)
{
dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
- return edlbNO;
+ return edlbsOffForever;
}
if (Flags & MD_REPRODUCIBLE)
{
- switch (eDLB)
+ switch (dlbState)
{
- case edlbNO:
+ case edlbsOffForever:
break;
- case edlbAUTO:
+ case edlbsOffCanTurnOn:
dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
- eDLB = edlbNO;
+ dlbState = edlbsOffForever;
break;
- case edlbYES:
+ case edlbsOn:
dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
break;
default:
- gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
+ gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
break;
}
}
- return eDLB;
+ return dlbState;
}
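In mdrun terms this maps the -dlb option to an initial state: "auto" starts
as edlbsOffCanTurnOn, "no" as edlbsOffForever and "yes" as edlbsOn, after
which reruns, non-dynamical integrators, missing cycle counting and
reproducibility requests can demote the choice to edlbsOffForever, as the
checks above show.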
static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
/* Initialize to GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
- comm->bDLB_locked = FALSE;
+ comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
comm->bCheckWhetherToTurnDlbOn = TRUE;
- comm->bDynLoadBal = (comm->eDLB == edlbYES);
if (fplog)
{
- fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
+ fprintf(fplog, "Dynamic load balancing: %s\n",
+ edlbs_names[comm->dlbState]);
}
- dd->bGridJump = comm->bDynLoadBal;
comm->bPMELoadBalDLBLimits = FALSE;
if (comm->nstSortCG)
/* We need to choose the optimal DD grid and possibly PME nodes */
limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
- comm->eDLB != edlbNO, dlb_scale,
+ comm->dlbState != edlbsOffForever, dlb_scale,
comm->cellsize_limit, comm->cutoff,
comm->bInterCGBondeds);
bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
!bC ? "-rdd" : "-rcon",
- comm->eDLB != edlbNO ? " or -dds" : "",
+ comm->dlbState != edlbsOffForever ? " or -dds" : "",
bC ? " or your LINCS settings" : "");
gmx_fatal_collective(FARGS, cr, NULL,
*npme_y = comm->npmenodes_y;
snew(comm->slb_frac, DIM);
- if (comm->eDLB == edlbNO)
+ if (comm->dlbState == edlbsOffForever)
{
comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
{
- if (comm->bBondComm || comm->eDLB != edlbNO)
+ if (comm->bBondComm || comm->dlbState != edlbsOffForever)
{
/* Set the bonded communication distance to halfway
* the minimum and the maximum,
*/
acs = average_cellsize_min(dd, ddbox);
comm->cutoff_mbody = 0.5*(r_bonded + acs);
- if (comm->eDLB != edlbNO)
+ if (comm->dlbState != edlbsOffForever)
{
/* Check if this does not limit the scaling */
comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
/* Change DLB from "auto" to "no". */
- comm->eDLB = edlbNO;
+ comm->dlbState = edlbsOffForever;
return;
}
dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
- comm->bDynLoadBal = TRUE;
- dd->bGridJump = TRUE;
+ comm->dlbState = edlbsOn;
set_dlb_limits(dd);
std::max(comm->cutoff, comm->cutoff_mbody));
fprintf(fplog, "%40s %-7s %6.3f nm\n",
"multi-body bonded interactions", "(-rdd)",
- (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
+ (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
}
if (dd->vsite_comm)
{
{
comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
}
- if (comm->bDynLoadBal)
+ if (dlbIsOn(comm))
{
set_dlb_limits(dd);
}
{
fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
}
- if (comm->eDLB != edlbNO)
+ if (comm->dlbState != edlbsOffForever)
{
set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
}
- print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
- if (comm->eDLB == edlbAUTO)
+ print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox);
+ if (comm->dlbState == edlbsOffCanTurnOn)
{
if (fplog)
{
np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
- if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
+ if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
dd->comm->cd[d].np_dlb > 0)
{
if (np > dd->comm->cd[d].np_dlb)
}
}
- if (dd->comm->eDLB != edlbNO)
+ if (dd->comm->dlbState != edlbsOffForever)
{
/* If DLB is not active yet, we don't need to check the grid jumps.
* Actually we shouldn't, because then the grid jump data is not set.
*/
- if (dd->comm->bDynLoadBal &&
+ if (dlbIsOn(dd->comm) &&
check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
{
LocallyLimited = 1;
*/
static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
{
- if (dd->comm->eDLB == edlbAUTO && !dd_dlb_is_locked(dd))
+ if (dd->comm->dlbState == edlbsOffCanTurnOn)
{
dd->comm->bCheckWhetherToTurnDlbOn = bValue;
}
{
const int nddp_chk_dlb = 100;
- if (dd->comm->eDLB != edlbAUTO)
+ if (dd->comm->dlbState != edlbsOffCanTurnOn)
{
return FALSE;
}
gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
{
- return dd->comm->bDynLoadBal;
+ return (dd->comm->dlbState == edlbsOn);
}
gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
{
- return dd->comm->bDLB_locked;
+ return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
}
void dd_dlb_lock(gmx_domdec_t *dd)
{
/* We can only lock the DLB when it is set to auto, otherwise don't do anything */
- if (dd->comm->eDLB == edlbAUTO)
+ if (dd->comm->dlbState == edlbsOffCanTurnOn)
{
- dd->comm->bDLB_locked = TRUE;
+ dd->comm->dlbState = edlbsOffTemporarilyLocked;
}
}
void dd_dlb_unlock(gmx_domdec_t *dd)
{
    /* We can only unlock the DLB when it was temporarily locked, otherwise don't do anything */
- if (dd->comm->eDLB == edlbAUTO)
+ if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
{
- dd->comm->bDLB_locked = FALSE;
- dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, !dd->comm->bDynLoadBal);
+ dd->comm->dlbState = edlbsOffCanTurnOn;
+ dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
}
}
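A usage sketch for the lock pair above, assuming a hypothetical caller and
measurement routine: a phase that must not have DLB switch on underneath it
can bracket the sensitive region:

    /* Keep DLB off while timings are being collected (hypothetical caller) */
    dd_dlb_lock(cr->dd);     /* edlbsOffCanTurnOn -> edlbsOffTemporarilyLocked */
    collect_timings(cr);     /* hypothetical measurement phase */
    dd_dlb_unlock(cr->dd);   /* back to edlbsOffCanTurnOn and re-enable the
                              * check for whether to turn DLB on */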
c->c[1][0] = comm->cell_x0[dim1];
/* All rows can see this row */
c->c[1][1] = comm->cell_x0[dim1];
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
if (bDistMB)
{
c->c[2][j] = comm->cell_x0[dim2];
}
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
/* Use the maximum of the i-cells that see a j-cell */
for (i = 0; i < zones->nizone; i++)
*/
c->cr1[0] = comm->cell_x1[dim1];
c->cr1[3] = comm->cell_x1[dim1];
- if (dd->bGridJump)
+ if (dlbIsOn(dd->comm))
{
c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
if (bDistMB)
bBondComm = comm->bBondComm;
/* Do we need to determine extra distances for multi-body bondeds? */
- bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+ bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
/* Do we need to determine extra distances for only two-body bondeds? */
bDist2B = (bBondComm && !bDistMB);
zones = &comm->zones;
/* Do we need to determine extra distances for multi-body bondeds? */
- bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+ bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
for (z = zone_start; z < zone_end; z++)
{
/* With a staggered grid we have different sizes
* for non-shifted dimensions.
*/
- if (dd->bGridJump && zones->shift[z][dim] == 0)
+ if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
{
if (d == 1)
{
if (zones->shift[z][dim] > 0)
{
dim = dd->dim[d];
- if (!dd->bGridJump || d == 0)
+ if (!dlbIsOn(dd->comm) || d == 0)
{
zones->size[z].x0[dim] = comm->cell_x1[dim];
zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
bNStGlobalComm = (step % nstglobalcomm == 0);
- if (!comm->bDynLoadBal)
+ if (!dlbIsOn(comm))
{
bDoDLB = FALSE;
}
set_ddbox(dd, bMasterState, cr, ir, state_local->box,
TRUE, &top_local->cgs, state_local->x, &ddbox);
- bRedist = comm->bDynLoadBal;
+ bRedist = dlbIsOn(comm);
}
else
{
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
dd->ci[XX], dd->ci[YY], dd->ci[ZZ],
nrecv_local, ireq->n, specat_type,
specat_type, add_err,
- dd->bGridJump ? " or use the -rcon option of mdrun" : "");
+ dd_dlb_is_on(dd) ? " or use the -rcon option of mdrun" : "");
}
spac->at_start = at_start;
{
pme_lb->bBalance = dd_dlb_is_on(cr->dd);
}
- else
+ /* We should ignore the first timing to avoid timing allocation
+ * overhead. And since the PME load balancing is called just
+ * before DD repartitioning, the ratio returned by dd_pme_f_ratio
+ * is not over the last nstlist steps, but the nstlist steps before
+ * that. So the first useful ratio is available at step_rel=3*nstlist.
+ */
+ else if (step_rel >= 3*ir->nstlist)
{
if (DDMASTER(cr->dd))
{
- /* PME node load is too high, start tuning */
+ /* If PME rank load is too high, start tuning */
pme_lb->bBalance =
(dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
}
}
if (!pme_lb->bBalance &&
- (!pme_lb->bSepPMERanks || (step_rel <= pme_lb->step_rel_stop)))
+ (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
{
/* We have just deactivated the balancing and we're not measuring PP/PME
* imbalance during the first steps of the run: deactivate the tuning.
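To make the step arithmetic above concrete: with nstlist = 20, the first
ratio from dd_pme_f_ratio that is unaffected by the ignored first timing
window becomes available at step_rel = 3*nstlist = 60, which is when the
tuning decision can first trigger.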
#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
+#include "gromacs/topology/index.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/futil.h"
#include "gromacs/utility/smalloc.h"
VACF, MVACF, DOS, DOS_SOLID, DOS_DIFF, DOS_CP, DOS_S, DOS_A, DOS_E, DOS_NR
};
+static int calcMoleculesInIndexGroup(t_block *mols, int natoms, atom_id *index, int nindex)
+{
+ int i = 0;
+ int mol = 0;
+ int nMol = 0;
+ int j;
+
+ while (i < nindex)
+ {
+ while (index[i] > mols->index[mol])
+ {
+ mol++;
+ if (mol >= mols->nr)
+ {
+ gmx_fatal(FARGS, "Atom index out of range: %d", index[i]+1);
+ }
+ }
+ for (j = mols->index[mol]; j < mols->index[mol+1]; j++)
+ {
+ if (index[i] != j)
+ {
+ gmx_fatal(FARGS, "The index group does not consist of whole molecules");
+ }
+ i++;
+            if (i > natoms)
+ {
+ gmx_fatal(FARGS, "Index contains atom numbers larger than the topology");
+ }
+ }
+ nMol++;
+ }
+ return nMol;
+}
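A worked example of the whole-molecule check above, assuming a hypothetical
topology of two three-atom molecules, i.e. mols->index = {0, 3, 6} and
mols->nr = 2:

    /* index group {0,1,2}       -> returns 1 (first molecule)        */
    /* index group {0,1,2,3,4,5} -> returns 2 (both molecules)        */
    /* index group {1,2,3}       -> gmx_fatal: the group does not     */
    /*                              consist of whole molecules        */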
+
static double FD(double Delta, double f)
{
return (2*pow(Delta, -4.5)*pow(f, 7.5) -
}
}
-static void dump_fy(output_env_t oenv, real toler)
-{
- FILE *fp;
- double Delta, f, y, DD;
- const char *leg[] = { "f", "fy", "y" };
-
- DD = pow(10.0, 0.125);
- fp = xvgropen("fy.xvg", "Fig. 2, Lin2003a", "Delta", "y or fy", oenv);
- xvgr_legend(fp, asize(leg), leg, oenv);
- if (output_env_get_print_xvgr_codes(oenv))
- {
- fprintf(fp, "@ world 1e-05, 0, 1000, 1\n");
- fprintf(fp, "@ xaxes scale Logarithmic\n");
- }
- for (Delta = 1e-5; (Delta <= 1000); Delta *= DD)
- {
- f = calc_fluidicity(Delta, toler);
- y = calc_y(f, Delta, toler);
- fprintf(fp, "%10g %10g %10g %10g\n", Delta, f, f*y, y);
- }
- xvgrclose(fp);
-}
-
-static void dump_w(output_env_t oenv, real beta)
-{
- FILE *fp;
- double nu;
- const char *leg[] = { "wCv", "wS", "wA", "wE" };
-
- fp = xvgropen("w.xvg", "Fig. 1, Berens1983a", "\\f{12}b\\f{4}h\\f{12}n",
- "w", oenv);
- xvgr_legend(fp, asize(leg), leg, oenv);
- for (nu = 1; (nu < 100); nu += 0.05)
- {
- fprintf(fp, "%10g %10g %10g %10g %10g\n", beta*PLANCK*nu,
- wCsolid(nu, beta), wSsolid(nu, beta),
- wAsolid(nu, beta), wEsolid(nu, beta));
- }
- xvgrclose(fp);
-}
-
int gmx_dos(int argc, char *argv[])
{
const char *desc[] = {
"all vibrations. For flexible systems that would be around a few fs",
"between saving. Properties based on the DoS are printed on the",
"standard output."
+ "Note that the density of states is calculated from the mass-weighted",
+ "autocorrelation, and by default only from the square of the real",
+ "component rather than absolute value. This means the shape can differ",
+ "substantially from the plain vibrational power spectrum you can",
+ "calculate with gmx velacc."
};
const char *bugs[] = {
"This program needs a lot of memory: total usage equals the number of atoms times 3 times number of frames times 4 (or 8 when run in double precision)."
gmx_fft_t fft;
double cP, S, A, E, DiffCoeff, Delta, f, y, z, sigHS, Shs, Sig, DoS0, recip_fac;
double wCdiff, wSdiff, wAdiff, wEdiff;
-
- static gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalize = FALSE;
- static gmx_bool bRecip = FALSE, bDump = FALSE;
+ int grpNatoms;
+ atom_id *index;
+ char *grpname;
+ double invNormalize;
+ gmx_bool normalizeAutocorrelation;
+
+ static gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalizeDos = FALSE;
+ static gmx_bool bRecip = FALSE;
static real Temp = 298.15, toler = 1e-6;
+
t_pargs pa[] = {
{ "-v", FALSE, etBOOL, {&bVerbose},
"Be loud and noisy." },
"Use cm^-1 on X-axis instead of 1/ps for DoS plots." },
{ "-abs", FALSE, etBOOL, {&bAbsolute},
"Use the absolute value of the Fourier transform of the VACF as the Density of States. Default is to use the real component only" },
- { "-normdos", FALSE, etBOOL, {&bNormalize},
- "Normalize the DoS such that it adds up to 3N. This is a hack that should not be necessary." },
+ { "-normdos", FALSE, etBOOL, {&bNormalizeDos},
+ "Normalize the DoS such that it adds up to 3N. This should usually not be necessary." },
{ "-T", FALSE, etREAL, {&Temp},
"Temperature in the simulation" },
{ "-toler", FALSE, etREAL, {&toler},
- "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" },
- { "-dump", FALSE, etBOOL, {&bDump},
- "[HIDDEN]Dump the y/fy plot corresponding to Fig. 2 inLin2003a and the and the weighting functions corresponding to Fig. 1 in Berens1983a." }
+ "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" }
};
t_filenm fnm[] = {
}
beta = 1/(Temp*BOLTZ);
- if (bDump)
- {
- printf("Dumping reference figures. Thanks for your patience.\n");
- dump_fy(oenv, toler);
- dump_w(oenv, beta);
- exit(0);
- }
fplog = gmx_fio_fopen(ftp2fn(efLOG, NFILE, fnm), "w");
fprintf(fplog, "Doing density of states analysis based on trajectory.\n");
please_cite(fplog, "Pascal2011a");
please_cite(fplog, "Caleman2011b");
- read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box,
- TRUE);
+ read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box, TRUE);
+
+ /* Handle index groups */
+ get_index(&top.atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &grpNatoms, &index, &grpname);
+
V = det(box);
tmass = 0;
- for (i = 0; (i < top.atoms.nr); i++)
+ for (i = 0; i < grpNatoms; i++)
{
- tmass += top.atoms.atom[i].m;
+ tmass += top.atoms.atom[index[i]].m;
}
- Natom = top.atoms.nr;
- Nmol = top.mols.nr;
+ Natom = grpNatoms;
+ Nmol = calcMoleculesInIndexGroup(&top.mols, top.atoms.nr, index, grpNatoms);
gnx = Natom*DIM;
/* Correlation stuff */
}
for (i = 0; i < gnx; i += DIM)
{
- c1[i+XX][nframes] = fr.v[i/DIM][XX];
- c1[i+YY][nframes] = fr.v[i/DIM][YY];
- c1[i+ZZ][nframes] = fr.v[i/DIM][ZZ];
+ c1[i+XX][nframes] = fr.v[index[i/DIM]][XX];
+ c1[i+YY][nframes] = fr.v[index[i/DIM]][YY];
+ c1[i+ZZ][nframes] = fr.v[index[i/DIM]][ZZ];
}
t1 = fr.time;
printf("Going to do %d fourier transforms of length %d. Hang on.\n",
gnx, nframes);
}
+ /* Unfortunately the -normalize program option for the autocorrelation
+ * function calculation is added as a hack with a static variable in the
+ * autocorrelation.c source. That would work if we called the normal
+ * do_autocorr(), but this routine overrides that by directly calling
+ * the low-level functionality. That unfortunately leads to ignoring the
+ * default value for the option (which is to normalize).
+ * Since the absolute value seems to be important for the subsequent
+ * analysis below, we detect the value directly from the option, calculate
+ * the autocorrelation without normalization, and then apply the
+ * normalization just to the autocorrelation output
+ * (or not, if the user asked for a non-normalized autocorrelation).
+ */
+ normalizeAutocorrelation = opt2parg_bool("-normalize", npargs, ppa);
+
+ /* Note that we always disable normalization here, regardless of user settings */
low_do_autocorr(NULL, oenv, NULL, nframes, gnx, nframes, c1, dt, eacNormal, 0, FALSE,
FALSE, FALSE, -1, -1, 0);
snew(dos, DOS_NR);
}
for (i = 0; (i < gnx); i += DIM)
{
- mi = top.atoms.atom[i/DIM].m;
+ mi = top.atoms.atom[index[i/DIM]].m;
for (j = 0; (j < nframes/2); j++)
{
c1j = (c1[i+XX][j] + c1[i+YY][j] + c1[i+ZZ][j]);
dos[MVACF][j] += mi*c1j;
}
}
- fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity ACF",
+
+ fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity autocorrelation function",
"Time (ps)", "C(t)", oenv);
snew(tt, nframes/2);
+
+ invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
for (j = 0; (j < nframes/2); j++)
{
tt[j] = j*dt;
- fprintf(fp, "%10g %10g\n", tt[j], dos[VACF][j]);
+ fprintf(fp, "%10g %10g\n", tt[j], dos[VACF][j] * invNormalize);
}
xvgrclose(fp);
- fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity ACF",
+
+ fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity autocorrelation function",
"Time (ps)", "C(t)", oenv);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[MVACF][0] : 1.0;
+
for (j = 0; (j < nframes/2); j++)
{
- fprintf(fp, "%10g %10g\n", tt[j], dos[MVACF][j]);
+ fprintf(fp, "%10g %10g\n", tt[j], dos[MVACF][j] * invNormalize);
}
xvgrclose(fp);
}
/* Normalize it */
dostot = evaluate_integral(nframes/4, nu, dos[DOS], NULL, nframes/4, &stddev);
- if (bNormalize)
+ if (bNormalizeDos)
{
for (j = 0; (j < nframes/4); j++)
{
cP = BOLTZ * evaluate_integral(nframes/4, nu, dos[DOS_CP], NULL,
nframes/4, &stddev);
fprintf(fplog, "Heat capacity %g J/mol K\n", 1000*cP/Nmol);
-
- /*
- S = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_S],NULL,
- nframes/4,&stddev);
- fprintf(fplog,"Entropy %g J/mol K\n",1000*S/Nmol);
- A = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_A],NULL,
- nframes/4,&stddev);
- fprintf(fplog,"Helmholtz energy %g kJ/mol\n",A/Nmol);
- E = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_E],NULL,
- nframes/4,&stddev);
- fprintf(fplog,"Internal energy %g kJ/mol\n",E/Nmol);
- */
fprintf(fplog, "\nArrivederci!\n");
gmx_fio_fclose(fplog);
#ifdef GMX_GPU
-const gmx_bool bGPUBinary = TRUE;
+
+static const bool bGPUBinary = TRUE;
+
# ifdef GMX_USE_OPENCL
-const char *gpu_implementation = "OpenCL";
+
+static const char *gpu_implementation = "OpenCL";
/* Our current OpenCL implementation only supports using exactly one
* GPU per PP rank, so sharing is impossible */
-const gmx_bool bGpuSharingSupported = FALSE;
+static const bool bGpuSharingSupported = false;
/* Our current OpenCL implementation is not known to handle
* concurrency correctly (at context creation, JIT compilation, or JIT
* cache-management stages). OpenCL runtimes need not support it
* either; library MPI segfaults when creating OpenCL contexts;
* thread-MPI seems to work but is not yet known to be safe. */
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-# else
-const char *gpu_implementation = "CUDA";
-const gmx_bool bGpuSharingSupported = TRUE;
-const gmx_bool bMultiGpuPerNodeSupported = TRUE;
-# endif
-#else
-const gmx_bool bGPUBinary = FALSE;
-const char *gpu_implementation = "non-GPU";
-const gmx_bool bGpuSharingSupported = FALSE;
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#endif
+static const bool bMultiGpuPerNodeSupported = false;
+
+# else /* GMX_USE_OPENCL */
+
+// Our CUDA implementation supports everything
+static const char *gpu_implementation = "CUDA";
+static const bool bGpuSharingSupported = true;
+static const bool bMultiGpuPerNodeSupported = true;
+
+# endif /* GMX_USE_OPENCL */
+
+#else /* GMX_GPU */
+
+// Not compiled with GPU support
+static const bool bGPUBinary = false;
+static const char *gpu_implementation = "non-GPU";
+static const bool bGpuSharingSupported = false;
+static const bool bMultiGpuPerNodeSupported = false;
+
+#endif /* GMX_GPU */
/* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
const char * const gpu_detect_res_str[egpuNR] =
static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
const gmx_gpu_opt_t *gpu_opt);
+gmx_bool gmx_multiple_gpu_per_node_supported()
+{
+ return bMultiGpuPerNodeSupported;
+}
+
+gmx_bool gmx_gpu_sharing_supported()
+{
+ return bGpuSharingSupported;
+}
+
static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
{
int i, ndev;
}
else
{
- if (ngpu_comp > npppn)
+ /* TODO Should we have a gpu_opt->n_dev_supported field? */
+ if (ngpu_comp > npppn && gmx_multiple_gpu_per_node_supported())
{
md_print_warn(cr, fplog,
"NOTE: potentially sub-optimal launch configuration, %s started with less\n"
*/
if (cr->rank_pp_intranode == 0)
{
+ std::string reasonForLimit;
+ if (ngpu_comp > 1 &&
+ ngpu_use == 1 &&
+ !gmx_multiple_gpu_per_node_supported())
+ {
+ reasonForLimit = "can be used by ";
+ reasonForLimit += gpu_implementation;
+ reasonForLimit += " in GROMACS";
+ }
+ else
+ {
+ reasonForLimit = "was detected";
+ }
gmx_fatal(FARGS,
"Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+ "%s was started with %d PP %s%s%s, but only %d GPU%s %s.",
th_or_proc, btMPI ? "s" : "es", pernode,
ShortProgram(), npppn, th_or_proc,
th_or_proc_plural, pernode,
- ngpu_use, gpu_use_plural);
+ ngpu_use, gpu_use_plural, reasonForLimit.c_str());
}
}
}
{
int device_id;
- device_id = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+ device_id = gmx_gpu_sharing_supported() ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
uniq_ids[device_id] = 1;
}
/* Count the devices used. */
parse_digits_from_plain_string(env,
&gpu_opt->n_dev_use,
&gpu_opt->dev_use);
- if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+ if (!gmx_multiple_gpu_per_node_supported() && 1 < gpu_opt->n_dev_use)
{
gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per node", gpu_implementation);
}
- if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+ if (!gmx_gpu_sharing_supported() && anyGpuIdIsRepeated(gpu_opt))
{
gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
}
{
if (nrank % gpu_opt->n_dev_compatible == 0)
{
- nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
+ nshare = gmx_gpu_sharing_supported() ? nrank/gpu_opt->n_dev_compatible : 1;
}
else
{
/* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
- if (!bMultiGpuPerNodeSupported)
+ if (!gmx_multiple_gpu_per_node_supported())
{
gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
}
for (std::string::size_type i = 0; i < unescaped_ocl_root_path.length(); i++)
{
- if (inputStr[i] == ' ')
+ if (unescaped_ocl_root_path[i] == ' ')
{
ocl_root_path.push_back('\\');
}
int ngroup;
pcrd = &pull->coord[i-1];
- sprintf(buf, "pull-coord%d-groups", i);
- STYPE(buf, groups, "");
sprintf(buf, "pull-coord%d-type", i);
EETYPE(buf, pcrd->eType, epull_names);
sprintf(buf, "pull-coord%d-geometry", i);
EETYPE(buf, pcrd->eGeom, epullg_names);
+ sprintf(buf, "pull-coord%d-groups", i);
+ STYPE(buf, groups, "");
nscan = sscanf(groups, "%d %d %d %d %d", &pcrd->group[0], &pcrd->group[1], &pcrd->group[2], &pcrd->group[3], &idum);
ngroup = (pcrd->eGeom == epullgDIRRELATIVE) ? 4 : 2;
} /* fixes auto-indentation problems */
#endif
+/*! \brief Return whether mdrun can use more than one GPU per node
+ *
+ * The OpenCL implementation cannot use more than one GPU per node,
+ * for example. */
+gmx_bool gmx_multiple_gpu_per_node_supported();
+
+/*! \brief Return whether PP ranks can share a GPU
+ *
+ * The OpenCL implementation cannot share a GPU between ranks, for
+ * example. */
+gmx_bool gmx_gpu_sharing_supported();
+
/* the init and consistency functions depend on commrec that may not be
consistent in cuda because MPI types don't exist there. */
#ifndef __CUDACC__
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
ivec nc;
int ndim;
ivec dim; /* indexed by 0 to ndim */
- gmx_bool bGridJump;
/* PBC from dim 0 to npbcdim */
int npbcdim;
cl_command_queue stream[2]; /**< local and non-local GPU queues */
/** events used for synchronization */
- cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel
- is done (and the local transfer can proceed) */
- cl_event isc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
- the local stream that need to precede the
- non-local force calculations are done
- (e.g. f buffer 0-ing, local x/q H2D) */
-
- cl_bool bDoTime; /**< True if event-based timing is enabled. */
- cl_timers_t *timers; /**< OpenCL event-based timers. */
- struct gmx_wallclock_gpu_t *timings; /**< Timing data. */
+ cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel
+ is done (and the local transfer can proceed) */
+ cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
+ the local stream that need to precede the
+ non-local force calculations are done
+ (e.g. f buffer 0-ing, local x/q H2D) */
+
+ cl_bool bDoTime; /**< True if event-based timing is enabled. */
+ cl_timers_t *timers; /**< OpenCL event-based timers. */
+ struct gmx_wallclock_gpu_t *timings; /**< Timing data. */
};
#ifdef __cplusplus
nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
}
- if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
+ /* Check if we should split at this cj4 to get a list of size nsp */
+ if (nsp > 0 && nsp + nsp_cj4 > nsp_max)
{
/* Split the list at cj4 */
nbl->sci[sci].cj4_ind_end = cj4;
ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
- ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
+ ls[ZZ] = grid->na_c/(grid->atom_density*ls[XX]*ls[YY]);
/* The average squared length of the diagonal of a sub cell */
xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
/* 4 octants of a sphere */
vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
+ /* Estimate the number of cluster pairs as the local number of
+ * clusters times the volume they interact with times the density.
+ */
nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
/* Subtract the non-local pair count */
nsp_est -= nsp_est_nl;
+    /* For small cut-offs nsp_est will be an underestimate.
+ * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
+ * So to avoid too small or negative nsp_est we set a minimum of
+ * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
+ * This might be a slight overestimate for small non-periodic groups of
+ * atoms as will occur for a local domain with DD, but for small
+ * groups of atoms we'll anyhow be limited by nsubpair_target_min,
+ * so this overestimation will not matter.
+ */
+ nsp_est = max(nsp_est, grid->nsubc_tot*14.0);
+
if (debug)
{
fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
* direction has uniform DD cell boundaries.
*/
bDDRect = !(ddbox->tric_dir[i] ||
- (dd->bGridJump && i != dd->dim[0]));
+ (dd_dlb_is_on(dd) && i != dd->dim[0]));
radd = rlist;
if (i >= ddbox->npbcdim &&
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
AnalysisDataPlotModulePointer plotm(
new AnalysisDataPlotModule(settings.plotSettings()));
- plotm->setFileName(fnAll_);
+ plotm->setFileName(fnXYZ_);
plotm->setTitle("Distance");
plotm->setXAxisIsTime();
plotm->setYLabel("Distance (nm)");
#include "gromacs/utility/fatalerror.h"
+/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
+ * The real switching points will depend on the simulated system,
+ * the algorithms used and the hardware it's running on, as well as whether
+ * other jobs are running on the same machine. We try to take into account
+ * factors that have a large influence, such as recent Intel CPUs being
+ * much better at wide multi-threading. The remaining factors should
+ * (hopefully) have a small influence, such that the performance just before
+ * and after a switch point doesn't change too much.
+ */
+
+#ifdef GMX_OPENMP
+static const bool bOMP = true;
+#else
+static const bool bOMP = false;
+#endif
+
#ifdef GMX_THREAD_MPI
/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
* the number of threads will get lowered.
/* TODO choose nthreads_omp based on hardware topology
when we have a hardware topology detection library */
/* First we consider the case of no MPI (1 MPI rank).
- * In general, when running up to 4 threads, OpenMP should be faster.
+ * In general, when running up to 8 threads, OpenMP should be faster.
* Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
* On Intel>=Nehalem running OpenMP on a single CPU is always faster,
* even on two CPUs it's usually faster (but with many OpenMP threads
* Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
* model numbers we ensure also future Intel CPUs are covered.
*/
-const int nthreads_omp_always_faster_default = 6;
-const int nthreads_omp_always_faster_Nehalem = 12;
-const int nthreads_omp_always_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_default = 8;
+const int nthreads_omp_faster_Nehalem = 12;
+const int nthreads_omp_faster_Intel_AVX = 16;
/* For CPU only runs the fastest options are usually MPI or OpenMP only.
* With one GPU, using MPI only is almost never optimal, so we need to
* compare running pure OpenMP with combined MPI+OpenMP. This means higher
* OpenMP threads counts can still be ok. Multiplying the numbers above
* by a factor of 2 seems to be a good estimate.
*/
-const int nthreads_omp_always_faster_gpu_fac = 2;
+const int nthreads_omp_faster_gpu_fac = 2;
/* This is the case with MPI (2 or more MPI PP ranks).
* By default we will terminate with a fatal error when more than 8
const int nthreads_omp_mpi_target_max = 6;
-#ifdef GMX_USE_OPENCL
-static const bool bGpuSharingSupported = false;
-#else
-static const bool bGpuSharingSupported = true;
-#endif
-
-
-static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
+/* Returns the maximum OpenMP thread count for which using a single MPI rank
+ * should be faster than using multiple ranks with the same total thread count.
+ */
+static int nthreads_omp_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
{
int nth;
if (gmx_cpuid_vendor(cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX))
{
- nth = nthreads_omp_always_faster_Intel_AVX;
+ nth = nthreads_omp_faster_Intel_AVX;
}
else if (gmx_cpuid_is_intel_nehalem(cpuid_info))
{
- nth = nthreads_omp_always_faster_Nehalem;
+ nth = nthreads_omp_faster_Nehalem;
}
else
{
- nth = nthreads_omp_always_faster_default;
+ nth = nthreads_omp_faster_default;
}
if (bUseGPU)
{
- nth *= nthreads_omp_always_faster_gpu_fac;
+ nth *= nthreads_omp_faster_gpu_fac;
}
nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
return nth;
}
+/* Returns the maximum OpenMP thread count that passes the efficiency check */
+static int nthreads_omp_efficient_max(int gmx_unused nrank,
+ gmx_cpuid_t cpuid_info,
+ gmx_bool bUseGPU)
+{
+#if defined GMX_OPENMP && defined GMX_MPI
+ if (nrank > 1)
+ {
+ return nthreads_omp_mpi_ok_max;
+ }
+ else
+#endif
+ {
+ return nthreads_omp_faster(cpuid_info, bUseGPU);
+ }
+}
+
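Concretely, on a single rank (nrank == 1) with an AVX-capable Intel CPU and
no GPU, the limit above is nthreads_omp_faster_Intel_AVX = 16 (capped at
GMX_OPENMP_MAX_THREADS); with more than one MPI rank the stricter
nthreads_omp_mpi_ok_max bound applies instead.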
+/* Return the number of thread-MPI ranks to use.
+ * This is chosen such that we can always obey our own efficiency checks.
+ */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt,
int nthreads_tot,
/* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
nrank = nthreads_tot;
}
- else if (bGpuSharingSupported &&
- (nthreads_tot > nthreads_omp_always_faster(hwinfo->cpuid_info,
- ngpu > 0) ||
+ else if (gmx_gpu_sharing_supported() &&
+ (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
+ ngpu > 0) ||
(ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
{
/* The high OpenMP thread count will likely result in sub-optimal
}
else
{
- if (nthreads_tot <= nthreads_omp_always_faster(hwinfo->cpuid_info,
- ngpu > 0))
+ if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
{
/* Use pure OpenMP parallelization */
nrank = 1;
}
+static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
+{
+    /* This code relies on the fact that GPUs are not detected when GPU
+ * acceleration was disabled at run time by the user.
+ */
+ if (cutoff_scheme == ecutsVERLET &&
+ hwinfo->gpu_info.n_dev_compatible > 0)
+ {
+ if (gmx_multiple_gpu_per_node_supported())
+ {
+ return hwinfo->gpu_info.n_dev_compatible;
+ }
+ else
+ {
+ if (hwinfo->gpu_info.n_dev_compatible > 1)
+ {
+ md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
+ }
+ return 1;
+ }
+ }
+ else
+ {
+ return 0;
+ }
+}
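The resulting behavior: with the Verlet scheme and, say, four compatible
GPUs detected, a CUDA build returns 4, an OpenCL build warns and returns 1
(only one GPU usable per node), and the group scheme or a node without
compatible GPUs yields 0.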
+
+
#ifdef GMX_THREAD_MPI
/* Get the number of MPI ranks to use for thread-MPI based on how many
* were requested, which algorithms we're using,
* with the hardware, except that ntmpi could be larger than #GPU.
*/
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
- const gmx_hw_opt_t *hw_opt,
+ gmx_hw_opt_t *hw_opt,
const t_inputrec *inputrec,
const gmx_mtop_t *mtop,
const t_commrec *cr,
{
int nthreads_hw, nthreads_tot_max, nrank, ngpu;
int min_atoms_per_mpi_rank;
- gmx_bool bCanUseGPU;
/* Check if an algorithm does not support parallel simulation. */
if (inputrec->eI == eiLBFGS ||
inputrec->coulombtype == eelEWALD)
{
- md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+ md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
if (hw_opt->nthreads_tmpi > 1)
{
- gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+ gmx_fatal(FARGS, "You asked for more than 1 thread-MPI rank, but an algorithm doesn't support that");
}
return 1;
nthreads_tot_max = nthreads_hw;
}
- bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
- hwinfo->gpu_info.n_dev_compatible > 0);
- if (bCanUseGPU)
- {
- ngpu = hwinfo->gpu_info.n_dev_compatible;
- }
- else
- {
- ngpu = 0;
- }
+ ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);
if (inputrec->cutoff_scheme == ecutsGROUP)
{
}
else
{
- if (bCanUseGPU)
+ if (ngpu >= 1)
{
min_atoms_per_mpi_rank = min_atoms_per_gpu;
}
nrank = nrank_new;
+ /* We reduced the number of tMPI ranks, which means we might violate
+ * our own efficiency checks if we simply use all hardware threads.
+ */
+ if (bOMP && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
+ {
+ /* The user set neither the total nor the OpenMP thread count,
+ * we should use all hardware threads, unless we will violate
+ * our own efficiency limitation on the thread count.
+ */
+ int nt_omp_max;
+
+ nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);
+
+ if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
+ {
+ /* Limit the number of OpenMP threads to start */
+ hw_opt->nthreads_omp = nt_omp_max;
+ }
+ }
+
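As a worked case for the limit just applied: if a small system reduces the
count to nrank = 1 on a 32-thread AVX-capable Intel machine without a GPU,
nt_omp_max = 16, and since 1*16 < 32 mdrun starts 16 OpenMP threads rather
than all 32, staying inside its own efficiency bound.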
fprintf(stderr, "\n");
fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
fprintf(stderr, " only starting %d thread-MPI ranks.\n", nrank);
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt,
- gmx_bool bNTOptSet,
+ gmx_bool bNtOmpOptionSet,
t_commrec *cr,
FILE *fplog)
{
* OpenMP have been initialized. Check that here.
*/
#ifdef GMX_THREAD_MPI
+ assert(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max);
assert(hw_opt->nthreads_tmpi >= 1);
#endif
assert(gmx_omp_nthreads_get(emntDefault) >= 1);
if (DOMAINDECOMP(cr) && cr->nnodes > 1)
{
if (nth_omp_max < nthreads_omp_mpi_ok_min ||
- (!(ngpu > 0 && !bGpuSharingSupported) &&
+ (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
nth_omp_max > nthreads_omp_mpi_ok_max))
{
/* Note that we print target_max here, not ok_max */
nthreads_omp_mpi_ok_min,
nthreads_omp_mpi_target_max);
- if (bNTOptSet)
+ if (bNtOmpOptionSet)
{
md_print_warn(cr, fplog, "NOTE: %s\n", buf);
}
else
{
/* No domain decomposition (or only one domain) */
- if (!(ngpu > 0 && !bGpuSharingSupported) &&
- nth_omp_max > nthreads_omp_always_faster(hwinfo->cpuid_info, ngpu > 0))
+ if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
+ nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
{
/* To arrive here, the user/system set #ranks and/or #OMPthreads */
gmx_bool bEnvSet;
bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
- if (bNTOptSet || bEnvSet)
+ if (bNtOmpOptionSet || bEnvSet)
{
sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
}
* with different values per rank or node, since in that case
* the user can not set -ntomp to override the error.
*/
- if (bNTOptSet || (bEnvSet && nth_omp_min != nth_omp_max))
+ if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
{
md_print_warn(cr, fplog, "NOTE: %s\n", buf);
}
#else /* GMX_OPENMP && GMX_MPI */
/* No OpenMP and/or MPI: it doesn't make much sense to check */
GMX_UNUSED_VALUE(hw_opt);
- GMX_UNUSED_VALUE(bNTOptSet);
+ GMX_UNUSED_VALUE(bNtOmpOptionSet);
/* Check if we have more than 1 physical core, if detected,
* or more than 1 hardware thread if physical cores were not detected.
*/
}
if (hw_opt->nthreads_tmpi > 0)
{
- gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
+ gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
}
#endif
-#ifndef GMX_OPENMP
- if (hw_opt->nthreads_omp > 1)
+ if (!bOMP)
{
- gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+ }
+ hw_opt->nthreads_omp = 1;
}
- hw_opt->nthreads_omp = 1;
-#endif
if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
{
hw_opt->nthreads_omp > 0 &&
hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
{
- gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+ gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
}
if (hw_opt->nthreads_tmpi > 0 &&
hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
{
- gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+ gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
}
}
}
-#ifndef GMX_OPENMP
- if (hw_opt->nthreads_omp > 1)
+ if (!bOMP && hw_opt->nthreads_omp > 1)
{
gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
}
-#endif
if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
{
* At the point we have already called check_and_update_hw_opt.
* Thus all options should be internally consistent and consistent
* with the hardware, except that ntmpi could be larger than #GPU.
+ * If necessary, this function will modify hw_opt->nthreads_omp.
*/
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
- const gmx_hw_opt_t *hw_opt,
+ gmx_hw_opt_t *hw_opt,
const t_inputrec *inputrec,
const gmx_mtop_t *mtop,
const t_commrec *cr,
* intended to catch cases where the user starts 1 MPI rank per hardware
* thread or 1 rank per physical node.
* With a sub-optimal setup a note is printed to fplog and stderr when
- * bNtOptSet==TRUE; with bNtOptSet==FALSE a fatal error is issued.
+ * bNtOmpOptionSet==TRUE; with bNtOmpOptionSet==FALSE a fatal error is issued.
* This function should be called after thread-MPI and OpenMP are set up.
*/
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt,
- gmx_bool bNTOptSet,
+ gmx_bool bNtOmpOptionSet,
t_commrec *cr,
FILE *fplog);
hw_opt,
inputrec, mtop,
cr, fplog);
- if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
- {
- hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
- }
if (hw_opt->nthreads_tmpi > 1)
{