From: Erik Lindahl
Date: Thu, 9 Jul 2015 10:56:19 +0000 (+0200)
Subject: Merge "Merge release-5-1 into master"
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=be4c77e5c2c4a1512110217d0a3386b6945d11e7;hp=b7cb5f5232b4edb2118b2ba7bd78ef35adebad04;p=alexxy%2Fgromacs.git

Merge "Merge release-5-1 into master"
---

diff --git a/cmake/FindPythonModule.cmake b/cmake/FindPythonModule.cmake
index b8e965d1e6..02773c643f 100644
--- a/cmake/FindPythonModule.cmake
+++ b/cmake/FindPythonModule.cmake
@@ -32,29 +32,48 @@
 # To help us fund GROMACS development, we humbly ask that you cite
 # the research papers on the package. Check out http://www.gromacs.org.
 
-# Adapted from code posted on cmake-users by Mark Moll
+# Adapted from code posted on cmake-users by Mark Moll (the execute_process()
+# call remains, but other things have been rewritten for nicer behavior).
 find_package(PythonInterp)
-function(find_python_module module)
-    string(TOUPPER ${module} module_upper)
-    if(NOT PYTHONMODULE_${module_upper})
-        if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
-            set(${module}_FIND_REQUIRED TRUE)
-        endif()
-        if (NOT PYTHON_EXECUTABLE)
-            message(STATUS "Cannot find python module ${module} because no python executable is known")
-        else()
-            # A module's location is usually a directory, but for binary modules
-            # it's a .so file.
-            execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-                "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
-                RESULT_VARIABLE _${module}_status
-                OUTPUT_VARIABLE _${module}_location
-                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+function (find_python_module module)
+    string(TOUPPER ${module} _module_upper)
+    set(_find_package_module ${module})
+    set(_out_var PYTHONMODULE_${_module_upper})
+
+    include(CMakeParseArguments)
+    set(_options QUIET REQUIRED)
+    cmake_parse_arguments(ARG "${_options}" "" "" ${ARGN})
+    if (ARG_UNPARSED_ARGUMENTS)
+        message(FATAL_ERROR "Unknown arguments: ${ARG_UNPARSED_ARGUMENTS}")
+    endif()
+    if (ARG_REQUIRED)
+        set(${_find_package_module}_FIND_REQUIRED TRUE)
+    endif()
+    if (ARG_QUIET)
+        set(${_find_package_module}_FIND_QUIETLY TRUE)
+    endif()
+
+    if (NOT ${_out_var})
+        set(_status 1)
+        if (PYTHON_EXECUTABLE)
+            # A module's location is usually a directory, but for binary modules
+            # it's a .so file.
+            execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+                "import re, ${module}; print re.compile('/__init__.py.*').sub('',${module}.__file__)"
+                RESULT_VARIABLE _status
+                OUTPUT_VARIABLE _location
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+        endif()
+        if(_status)
+            set(_location ${_find_package_module}-NOTFOUND)
         endif()
-        if(NOT _${module}_status)
-            set(PYTHONMODULE_${module_upper} ${_${module}_location} CACHE STRING
-                "Location of Python module ${module}")
-        endif()
+        set(${_out_var} ${_location} CACHE STRING
+            "Location of Python module ${module}" FORCE)
+        mark_as_advanced(${_out_var})
     endif()
-    find_package_handle_standard_args(PYTHONMODULE_${module} DEFAULT_MSG PYTHONMODULE_${module_upper})
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(
+        ${_find_package_module} DEFAULT_MSG
+        ${_out_var} PYTHON_EXECUTABLE)
 endfunction()
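[Editorial note: a minimal usage sketch of the rewritten helper, not part of the patch. The module name numpy is an arbitrary example; the cache variable follows the PYTHONMODULE_<MODULE> convention set above, and the QUIET/REQUIRED options are the ones parsed by cmake_parse_arguments.]

    include(FindPythonModule)
    # Soft dependency: sets PYTHONMODULE_NUMPY, or leaves it at <module>-NOTFOUND
    find_python_module(numpy QUIET)
    if (PYTHONMODULE_NUMPY)
        message(STATUS "numpy found at ${PYTHONMODULE_NUMPY}")
    endif()
    # Hard dependency: configuration fails if the module is missing
    find_python_module(sphinx REQUIRED)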
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index 46bf447abd..6eae513025 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -41,8 +41,7 @@ find_program(SPHINX_EXECUTABLE NAMES sphinx-build
 mark_as_advanced(SPHINX_EXECUTABLE)
 
 # Detect Sphinx version
-
-if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
+if (SPHINX_EXECUTABLE AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
     execute_process(
         COMMAND ${SPHINX_EXECUTABLE} --version
         OUTPUT_VARIABLE SPHINX_VERSION_OUTPUT_VARIABLE
@@ -50,14 +49,17 @@ if(SPHINX_FOUND AND NOT DEFINED SPHINX_EXECUTABLE_VERSION)
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
-    string(REGEX REPLACE "Sphinx \\(${SPHINX_EXECUTABLE}\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
+    string(REGEX REPLACE "Sphinx \\([^)]*\\) ([^ ]+)" "\\1" SPHINX_EXECUTABLE_VERSION ${SPHINX_VERSION_OUTPUT_VARIABLE})
     set(SPHINX_EXECUTABLE_VERSION "${SPHINX_EXECUTABLE_VERSION}" CACHE INTERNAL "Version of ${SPHINX_EXECUTABLE}")
 endif()
 
+set(_find_deps_options)
+if (Sphinx_FIND_QUIETLY)
+    set(_find_deps_options QUIET)
+endif()
 include(FindPythonModule)
-find_python_module(pygments)
-
-if(PYTHONMODULE_PYGMENTS)
+find_python_module(pygments ${_find_deps_options})
+if (PYTHONMODULE_PYGMENTS)
     set(Sphinx_pygments_FOUND 1)
 endif()
 
diff --git a/cmake/gmxVersionInfo.cmake b/cmake/gmxVersionInfo.cmake
index 162b32818e..f7a412b281 100644
--- a/cmake/gmxVersionInfo.cmake
+++ b/cmake/gmxVersionInfo.cmake
@@ -253,7 +253,7 @@ set(REGRESSIONTEST_BRANCH "refs/heads/master")
 # each release. It's hard to test because it is only used for
 # REGRESSIONTEST_DOWNLOAD, which doesn't work until that tarball has
 # been placed on the server.
-set(REGRESSIONTEST_MD5SUM "6f8531a6e3c2a8912327b9cd450d8745" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
+set(REGRESSIONTEST_MD5SUM "bb67f145095249e9d4a93227fc4c352e" CACHE INTERNAL "MD5 sum of the regressiontests tarball")
 
 math(EXPR GMX_VERSION_NUMERIC
      "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_MINOR}*100 + ${GMX_VERSION_PATCH}")
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index d3f190a635..17a7681c8b 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -59,7 +59,7 @@ mark_as_advanced(SOURCE_MD5SUM)
 set(EXPECTED_DOXYGEN_VERSION 1.8.5)
 
 find_package(PythonInterp)
-find_package(Sphinx 1.2.3 COMPONENTS pygments)
+find_package(Sphinx 1.2.3 QUIET COMPONENTS pygments)
 
 # Even if we aren't going to make the full webpage, set up to put all
 # the documentation output in the same place, for convenience
@@ -250,6 +250,10 @@ else()
         COMMAND ${CMAKE_COMMAND} -E echo
             "HTML pages cannot be built because Sphinx is not available"
         VERBATIM)
+    add_custom_target(install-guide
+        COMMAND ${CMAKE_COMMAND} -E echo
+            "INSTALL cannot be built because Sphinx is not available"
+        VERBATIM)
     add_custom_target(man
         COMMAND ${CMAKE_COMMAND} -E echo
             "man pages cannot be built because Sphinx is not available"
diff --git a/docs/user-guide/mdp-options.rst b/docs/user-guide/mdp-options.rst
index 5767bd4f99..4fb9c0b204 100644
--- a/docs/user-guide/mdp-options.rst
+++ b/docs/user-guide/mdp-options.rst
@@ -1695,16 +1695,6 @@ applicable pulling coordinate.
    system, *e.g.* a water slab (see Engin et al.
    J. Chem. Phys. B 2010).
 
-.. mdp:: pull-coord1-groups
-
-   The two groups indices should be given on which this pull
-   coordinate will operate. The first index can be 0, in which case an
-   absolute reference of :mdp:`pull-coord1-origin` is used. With an
-   absolute reference the system is no longer translation invariant
-   and one should think about what to do with the center of mass
-   motion. Note that (only) for :mdp:`pull-coord1-geometry` =
-   :mdp-value:`direction-relative` four groups are required.
-
 .. mdp:: pull-coord1-type:
 
    .. mdp-value:: umbrella
@@ -1780,6 +1770,16 @@ applicable pulling coordinate.
       component. This geometry is not supported with constraint
      pulling.
 
+.. mdp:: pull-coord1-groups
+
+   The two groups indices should be given on which this pull
+   coordinate will operate. The first index can be 0, in which case an
+   absolute reference of :mdp:`pull-coord1-origin` is used. With an
+   absolute reference the system is no longer translation invariant
+   and one should think about what to do with the center of mass
+   motion. Note that (only) for :mdp:`pull-coord1-geometry` =
+   :mdp-value:`direction-relative` four groups are required.
+
 .. mdp:: pull-coord1-dim
 
    (Y Y Y)
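[Editorial note: a sketch of the resulting option order in an .mdp file, not part of the patch; group names and values are illustrative. The point of the move, mirrored by the parser reordering in src/gromacs/gmxpreprocess/readpull.c near the end of this patch, is that the geometry determines how many indices pull-coord1-groups needs.]

    pull                 = yes
    pull-ngroups         = 2
    pull-ncoords         = 1
    pull-group1-name     = Protein
    pull-group2-name     = Ligand
    pull-coord1-type     = umbrella
    pull-coord1-geometry = distance  ; direction-relative would need four groups
    pull-coord1-groups   = 1 2
    pull-coord1-dim      = Y Y Y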
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index e8de945b94..3986ed0562 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -216,9 +216,20 @@ enum {
 };
 
 enum {
-    edlbAUTO, edlbNO, edlbYES, edlbNR
+    edlbsOffForever,           /* DLB is off and will never be turned on */
+    edlbsOffCanTurnOn,         /* DLB is off and will turn on with imbalance */
+    edlbsOffTemporarilyLocked, /* DLB is off and temporarily can not turn on */
+    edlbsOn,                   /* DLB is on and will stay on forever */
+    edlbsNR
 };
-const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
+/* Allowed DLB state transitions:
+ *   edlbsOffCanTurnOn         -> edlbsOn
+ *   edlbsOffCanTurnOn         -> edlbsOffForever
+ *   edlbsOffCanTurnOn         -> edlbsOffTemporarilyLocked
+ *   edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
+ */
+
+const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
 
 typedef struct
 {
@@ -297,14 +308,10 @@ typedef struct gmx_domdec_comm
     t_blocka *cglink;
     char     *bLocalCG;
 
-    /* The DLB option */
-    int      eDLB;
-    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
-    gmx_bool bDLB_locked;
-    /* With eDLB=edlbAUTO, should we check if to DLB on at the next DD? */
+    /* The DLB state, possible values are defined above */
+    int      dlbState;
+    /* With dlbState=edlbsOffCanTurnOn, should we check if to DLB on at the next DD? */
     gmx_bool bCheckWhetherToTurnDlbOn;
-    /* Are we actually using DLB? */
-    gmx_bool bDynLoadBal;
 
     /* Cell sizes for static load balancing, first index cartesian */
     real **slb_frac;
@@ -587,6 +594,11 @@ t_block *dd_charge_groups_global(gmx_domdec_t *dd)
     return &dd->comm->cgs_gl;
 }
 
+static bool dlbIsOn(const gmx_domdec_comm_t *comm)
+{
+    return (comm->dlbState == edlbsOn);
+}
+
 static void vec_rvec_init(vec_rvec_t *v)
 {
     v->nalloc = 0;
@@ -665,7 +677,7 @@ void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
         dim         = dd->dim[d];
         shift0[dim] = zones->izone[izone].shift0[dim];
         shift1[dim] = zones->izone[izone].shift1[dim];
-        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
+        if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
         {
             /* A conservative approach, this can be optimized */
             shift0[dim] -= 1;
@@ -3219,7 +3231,7 @@ static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
         }
     }
 
-    if (!comm->bDynLoadBal)
+    if (!dlbIsOn(comm))
    {
         copy_rvec(cellsize_min, comm->cellsize_min);
     }
@@ -3815,7 +3827,7 @@ static void set_dd_cell_sizes(gmx_domdec_t *dd,
     copy_rvec(comm->cell_x0, comm->old_cell_x0);
     copy_rvec(comm->cell_x1, comm->old_cell_x1);
 
-    if (comm->bDynLoadBal)
+    if (dlbIsOn(comm))
     {
         if (DDMASTER(dd))
         {
@@ -3856,7 +3868,7 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
         /* Without PBC we don't have restrictions on the outer cells */
         if (!(dim >= ddbox->npbcdim &&
               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
-            comm->bDynLoadBal &&
+            dlbIsOn(comm) &&
             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
             comm->cellsize_min[dim])
         {
@@ -3870,11 +3882,11 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
         }
     }
 
-    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
+    if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
     {
         /* Communicate the boundaries and update cell_ns_x0/1 */
         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
-        if (dd->bGridJump && dd->ndim > 1)
+        if (dlbIsOn(dd->comm) && dd->ndim > 1)
         {
             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
         }
     }
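/* [Editorial sketch, not part of the patch: the transition table above,
 * written out as a hypothetical validity check. Only the auto state may
 * change; edlbsOn and edlbsOffForever are terminal.] */
static gmx_bool dlbTransitionIsAllowed(int from, int to)
{
    switch (from)
    {
        case edlbsOffCanTurnOn:
            /* may turn on, give up forever, or be locked temporarily */
            return (to == edlbsOn || to == edlbsOffForever ||
                    to == edlbsOffTemporarilyLocked);
        case edlbsOffTemporarilyLocked:
            /* unlocking returns to the auto state */
            return (to == edlbsOffCanTurnOn);
        default:
            return FALSE;
    }
}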
@@ -5000,7 +5012,7 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
                 /* Check which direction this cg should go */
                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
                 {
-                    if (dd->bGridJump)
+                    if (dlbIsOn(dd->comm))
                     {
                         /* The cell boundaries for dimension d2 are not equal
                          * for each cell row of the lower dimension(s),
@@ -5299,7 +5311,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
         {
             load = &comm->load[d];
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
             {
                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
             }
@@ -5308,7 +5320,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             {
                 sbuf[pos++] = dd_force_load(comm);
                 sbuf[pos++] = sbuf[0];
-                if (dd->bGridJump)
+                if (dlbIsOn(dd->comm))
                 {
                     sbuf[pos++] = sbuf[0];
                     sbuf[pos++] = cell_frac;
@@ -5328,7 +5340,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                 {
                     sbuf[pos++] = comm->load[d+1].sum;
                     sbuf[pos++] = comm->load[d+1].max;
-                    if (dd->bGridJump)
+                    if (dlbIsOn(dd->comm))
                     {
                         sbuf[pos++] = comm->load[d+1].sum_m;
                         sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
@@ -5357,7 +5369,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             if (dd->ci[dim] == dd->master_ci[dim])
             {
                 /* We are the root, process this row */
-                if (comm->bDynLoadBal)
+                if (dlbIsOn(comm))
                 {
                     root = comm->root[d];
                 }
@@ -5374,7 +5386,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                         load->sum += load->load[pos++];
                         load->max  = std::max(load->max, load->load[pos]);
                         pos++;
-                        if (dd->bGridJump)
+                        if (dlbIsOn(dd->comm))
                         {
                             if (root->bLimited)
                             {
@@ -5408,7 +5420,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                             pos++;
                         }
                     }
-                    if (comm->bDynLoadBal && root->bLimited)
+                    if (dlbIsOn(comm) && root->bLimited)
                    {
                         load->sum_m *= dd->nc[dim];
                         load->flags |= (1<<d);
                     }
@@ ... @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
         comm->load_step += comm->cycl[ddCyclStep];
         comm->load_sum  += comm->load[0].sum;
         comm->load_max  += comm->load[0].max;
-        if (comm->bDynLoadBal)
+        if (dlbIsOn(comm))
         {
             for (d = 0; d < dd->ndim; d++)
             {
@@ -5492,7 +5504,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
             fprintf(stderr, "%s", buf);
         }
         bLim = FALSE;
-        if (comm->bDynLoadBal)
+        if (dlbIsOn(comm))
         {
             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
             for (d = 0; d < dd->ndim; d++)
@@ -5535,7 +5547,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
             sprintf(buf,
                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                     " in the domain decomposition.\n", lossf*100);
-            if (!comm->bDynLoadBal)
+            if (!dlbIsOn(comm))
             {
                 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n");
             }
@@ -5580,6 +5592,9 @@ static float dd_f_imbal(gmx_domdec_t *dd)
 
 float dd_pme_f_ratio(gmx_domdec_t *dd)
 {
+    /* Should only be called on the DD master rank */
+    assert(DDMASTER(dd));
+
     if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
     {
         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
@@ -5610,7 +5625,7 @@ static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
         fprintf(fplog, "\n");
     }
     fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
-    if (dd->comm->bDynLoadBal)
+    if (dlbIsOn(dd->comm))
     {
         fprintf(fplog, " vol min/aver %5.3f%c",
                 dd_vol_min(dd), flags ? '!' : ' ');
@@ -5628,7 +5643,7 @@ static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
 
 static void dd_print_load_verbose(gmx_domdec_t *dd)
 {
-    if (dd->comm->bDynLoadBal)
+    if (dlbIsOn(dd->comm))
     {
         fprintf(stderr, "vol %4.2f%c ",
                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
@@ -5669,7 +5684,7 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
     if (bPartOfGroup)
     {
         dd->comm->mpi_comm_load[dim_ind] = c_row;
-        if (dd->comm->eDLB != edlbNO)
+        if (dd->comm->dlbState != edlbsOffForever)
         {
             if (dd->ci[dim] == dd->master_ci[dim])
             {
@@ -5963,7 +5978,7 @@ void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
         }
     }
 
-    if (dd->comm->eDLB != edlbNO)
+    if (dd->comm->dlbState != edlbsOffForever)
     {
         snew(dd->comm->root, dd->ndim);
     }
@@ -6540,60 +6555,60 @@ static int check_dlb_support(FILE *fplog, t_commrec *cr,
                              const char *dlb_opt, gmx_bool bRecordLoad,
                             unsigned long Flags, t_inputrec *ir)
 {
-    int  eDLB = -1;
+    int  dlbState = -1;
     char buf[STRLEN];
 
     switch (dlb_opt[0])
     {
-        case 'a': eDLB = edlbAUTO; break;
-        case 'n': eDLB = edlbNO; break;
-        case 'y': eDLB = edlbYES; break;
+        case 'a': dlbState = edlbsOffCanTurnOn; break;
+        case 'n': dlbState = edlbsOffForever; break;
+        case 'y': dlbState = edlbsOn; break;
        default: gmx_incons("Unknown dlb_opt");
     }
 
     if (Flags & MD_RERUN)
     {
-        return edlbNO;
+        return edlbsOffForever;
     }
 
     if (!EI_DYNAMICS(ir->eI))
     {
-        if (eDLB == edlbYES)
+        if (dlbState == edlbsOn)
         {
             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
             dd_warning(cr, fplog, buf);
         }
-        return edlbNO;
+        return edlbsOffForever;
     }
 
     if (!bRecordLoad)
     {
         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
-        return edlbNO;
+        return edlbsOffForever;
     }
 
     if (Flags & MD_REPRODUCIBLE)
     {
-        switch (eDLB)
+        switch (dlbState)
         {
-            case edlbNO:
+            case edlbsOffForever:
                 break;
-            case edlbAUTO:
+            case edlbsOffCanTurnOn:
                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
-                eDLB = edlbNO;
+                dlbState = edlbsOffForever;
                 break;
-            case edlbYES:
+            case edlbsOn:
                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
                 break;
             default:
-                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
+                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
                 break;
         }
     }
 
-    return eDLB;
+    return dlbState;
 }
 
 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
@@ -6739,16 +6754,14 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
     /* Initialize to GPU share count to 0, might change later */
     comm->nrank_gpu_shared = 0;
 
-    comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
-    comm->bDLB_locked = FALSE;
+    comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
     comm->bCheckWhetherToTurnDlbOn = TRUE;
-    comm->bDynLoadBal = (comm->eDLB == edlbYES);
     if (fplog)
     {
-        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
+        fprintf(fplog, "Dynamic load balancing: %s\n",
+                edlbs_names[comm->dlbState]);
     }
-    dd->bGridJump              = comm->bDynLoadBal;
     comm->bPMELoadBalDLBLimits = FALSE;
 
     if (comm->nstSortCG)
@@ -6941,7 +6954,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 
         /* We need to choose the optimal DD grid and possibly PME nodes */
         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
-                               comm->eDLB != edlbNO, dlb_scale,
+                               comm->dlbState != edlbsOffForever, dlb_scale,
                                comm->cellsize_limit, comm->cutoff,
                               comm->bInterCGBondeds);
 
@@ -6950,7 +6963,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
sprintf(buf, "Change the number of ranks or mdrun option %s%s%s", !bC ? "-rdd" : "-rcon", - comm->eDLB != edlbNO ? " or -dds" : "", + comm->dlbState != edlbsOffForever ? " or -dds" : "", bC ? " or your LINCS settings" : ""); gmx_fatal_collective(FARGS, cr, NULL, @@ -7045,7 +7058,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, *npme_y = comm->npmenodes_y; snew(comm->slb_frac, DIM); - if (comm->eDLB == edlbNO) + if (comm->dlbState == edlbsOffForever) { comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex); comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey); @@ -7054,7 +7067,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, if (comm->bInterCGBondeds && comm->cutoff_mbody == 0) { - if (comm->bBondComm || comm->eDLB != edlbNO) + if (comm->bBondComm || comm->dlbState != edlbsOffForever) { /* Set the bonded communication distance to halfway * the minimum and the maximum, @@ -7062,7 +7075,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, */ acs = average_cellsize_min(dd, ddbox); comm->cutoff_mbody = 0.5*(r_bonded + acs); - if (comm->eDLB != edlbNO) + if (comm->dlbState != edlbsOffForever) { /* Check if this does not limit the scaling */ comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs); @@ -7153,14 +7166,13 @@ static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step) dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n"); /* Change DLB from "auto" to "no". */ - comm->eDLB = edlbNO; + comm->dlbState = edlbsOffForever; return; } dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n"); - comm->bDynLoadBal = TRUE; - dd->bGridJump = TRUE; + comm->dlbState = edlbsOn; set_dlb_limits(dd); @@ -7331,7 +7343,7 @@ static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd, std::max(comm->cutoff, comm->cutoff_mbody)); fprintf(fplog, "%40s %-7s %6.3f nm\n", "multi-body bonded interactions", "(-rdd)", - (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit)); + (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit)); } if (dd->vsite_comm) { @@ -7450,7 +7462,7 @@ static void set_cell_limits_dlb(gmx_domdec_t *dd, { comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit); } - if (comm->bDynLoadBal) + if (dlbIsOn(comm)) { set_dlb_limits(dd); } @@ -7509,13 +7521,13 @@ void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale, { fprintf(debug, "The DD cut-off is %f\n", comm->cutoff); } - if (comm->eDLB != edlbNO) + if (comm->dlbState != edlbsOffForever) { set_cell_limits_dlb(dd, dlb_scale, ir, ddbox); } - print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox); - if (comm->eDLB == edlbAUTO) + print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox); + if (comm->dlbState == edlbsOffCanTurnOn) { if (fplog) { @@ -7571,7 +7583,7 @@ static gmx_bool test_dd_cutoff(t_commrec *cr, np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]); - if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim && + if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim && dd->comm->cd[d].np_dlb > 0) { if (np > dd->comm->cd[d].np_dlb) @@ -7590,12 +7602,12 @@ static gmx_bool test_dd_cutoff(t_commrec *cr, } } - if (dd->comm->eDLB != edlbNO) + if (dd->comm->dlbState != edlbsOffForever) { /* If DLB is not active yet, we don't need to check the grid jumps. 
          * Actually we shouldn't, because then the grid jump data is not set.
          */
-        if (dd->comm->bDynLoadBal &&
+        if (dlbIsOn(dd->comm) &&
             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
         {
             LocallyLimited = 1;
@@ -7655,7 +7667,7 @@ void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
  */
 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
 {
-    if (dd->comm->eDLB == edlbAUTO && !dd_dlb_is_locked(dd))
+    if (dd->comm->dlbState == edlbsOffCanTurnOn)
     {
         dd->comm->bCheckWhetherToTurnDlbOn = bValue;
     }
@@ -7668,7 +7680,7 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
 {
     const int nddp_chk_dlb = 100;
 
-    if (dd->comm->eDLB != edlbAUTO)
+    if (dd->comm->dlbState != edlbsOffCanTurnOn)
     {
         return FALSE;
     }
@@ -7695,30 +7707,30 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
 
 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
 {
-    return dd->comm->bDynLoadBal;
+    return (dd->comm->dlbState == edlbsOn);
 }
 
 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
 {
-    return dd->comm->bDLB_locked;
+    return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
 }
 
 void dd_dlb_lock(gmx_domdec_t *dd)
 {
     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
-    if (dd->comm->eDLB == edlbAUTO)
+    if (dd->comm->dlbState == edlbsOffCanTurnOn)
     {
-        dd->comm->bDLB_locked = TRUE;
+        dd->comm->dlbState = edlbsOffTemporarilyLocked;
     }
 }
 
 void dd_dlb_unlock(gmx_domdec_t *dd)
 {
     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
-    if (dd->comm->eDLB == edlbAUTO)
+    if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
     {
-        dd->comm->bDLB_locked = FALSE;
-        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, !dd->comm->bDynLoadBal);
+        dd->comm->dlbState = edlbsOffCanTurnOn;
+        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
     }
 }
 
@@ -7883,7 +7895,7 @@ set_dd_corners(const gmx_domdec_t *dd,
         c->c[1][0] = comm->cell_x0[dim1];
         /* All rows can see this row */
         c->c[1][1] = comm->cell_x0[dim1];
-        if (dd->bGridJump)
+        if (dlbIsOn(dd->comm))
         {
             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
             if (bDistMB)
@@ -7902,7 +7914,7 @@ set_dd_corners(const gmx_domdec_t *dd,
             {
                 c->c[2][j] = comm->cell_x0[dim2];
             }
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
             {
                 /* Use the maximum of the i-cells that see a j-cell */
                 for (i = 0; i < zones->nizone; i++)
@@ -7937,7 +7949,7 @@ set_dd_corners(const gmx_domdec_t *dd,
              */
             c->cr1[0] = comm->cell_x1[dim1];
             c->cr1[3] = comm->cell_x1[dim1];
-            if (dd->bGridJump)
+            if (dlbIsOn(dd->comm))
             {
                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                 if (bDistMB)
@@ -8276,7 +8288,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
     bBondComm = comm->bBondComm;
 
     /* Do we need to determine extra distances for multi-body bondeds? */
-    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
 
     /* Do we need to determine extra distances for only two-body bondeds? */
     bDist2B = (bBondComm && !bDistMB);
@@ -8708,7 +8720,7 @@ static void set_zones_size(gmx_domdec_t *dd,
 
     zones = &comm->zones;
 
     /* Do we need to determine extra distances for multi-body bondeds? */
-    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
+    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
 
     for (z = zone_start; z < zone_end; z++)
     {
@@ -8728,7 +8740,7 @@ static void set_zones_size(gmx_domdec_t *dd,
             /* With a staggered grid we have different sizes
              * for non-shifted dimensions.
              */
-            if (dd->bGridJump && zones->shift[z][dim] == 0)
+            if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
             {
                 if (d == 1)
                 {
@@ -8757,7 +8769,7 @@ static void set_zones_size(gmx_domdec_t *dd,
             if (zones->shift[z][dim] > 0)
             {
                 dim = dd->dim[d];
-                if (!dd->bGridJump || d == 0)
+                if (!dlbIsOn(dd->comm) || d == 0)
                 {
                     zones->size[z].x0[dim] = comm->cell_x1[dim];
                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
@@ -9469,7 +9481,7 @@ void dd_partition_system(FILE *fplog,
 
     bNStGlobalComm = (step % nstglobalcomm == 0);
 
-    if (!comm->bDynLoadBal)
+    if (!dlbIsOn(comm))
     {
         bDoDLB = FALSE;
     }
@@ -9630,7 +9642,7 @@ void dd_partition_system(FILE *fplog,
         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                   TRUE, &top_local->cgs, state_local->x, &ddbox);
 
-        bRedist = comm->bDynLoadBal;
+        bRedist = dlbIsOn(comm);
     }
     else
     {
diff --git a/src/gromacs/domdec/domdec_specatomcomm.cpp b/src/gromacs/domdec/domdec_specatomcomm.cpp
index abc52ba315..75d6a2e3d1 100644
--- a/src/gromacs/domdec/domdec_specatomcomm.cpp
+++ b/src/gromacs/domdec/domdec_specatomcomm.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2006,2007,2008,2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -603,7 +603,7 @@ int setup_specat_communication(gmx_domdec_t *dd,
                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ],
                       nrecv_local, ireq->n, specat_type,
                       specat_type, add_err,
-                      dd->bGridJump ? " or use the -rcon option of mdrun" : "");
+                      dd_dlb_is_on(dd) ? " or use the -rcon option of mdrun" : "");
         }
 
         spac->at_start = at_start;
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp
index f8facc5621..089361143a 100644
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -934,11 +934,17 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
     {
         pme_lb->bBalance = dd_dlb_is_on(cr->dd);
     }
-    else
+    /* We should ignore the first timing to avoid timing allocation
+     * overhead. And since the PME load balancing is called just
+     * before DD repartitioning, the ratio returned by dd_pme_f_ratio
+     * is not over the last nstlist steps, but the nstlist steps before
+     * that. So the first useful ratio is available at step_rel=3*nstlist.
+     */
+    else if (step_rel >= 3*ir->nstlist)
     {
         if (DDMASTER(cr->dd))
         {
-            /* PME node load is too high, start tuning */
+            /* If PME rank load is too high, start tuning */
             pme_lb->bBalance =
                 (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
         }
@@ -1019,7 +1025,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
     }
 
     if (!pme_lb->bBalance &&
-        (!pme_lb->bSepPMERanks || (step_rel <= pme_lb->step_rel_stop)))
+        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
     {
         /* We have just deactivated the balancing and we're not measuring PP/PME
          * imbalance during the first steps of the run: deactivate the tuning.
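[Editorial note: a worked example of the 3*nstlist comment above, with an illustrative value. With nstlist = 20, the ratio visible at step_rel = 40 still describes steps 0-20, which include the timing-allocation overhead; the ratio visible at step_rel = 60 = 3*nstlist describes steps 20-40 and is the first one worth acting on, which is exactly the condition the new else-if enforces.]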
diff --git a/src/gromacs/gmxana/gmx_dos.c b/src/gromacs/gmxana/gmx_dos.c
index 7592091e9b..d4423adc47 100644
--- a/src/gromacs/gmxana/gmx_dos.c
+++ b/src/gromacs/gmxana/gmx_dos.c
@@ -56,6 +56,7 @@
 #include "gromacs/math/units.h"
 #include "gromacs/math/utilities.h"
 #include "gromacs/math/vec.h"
+#include "gromacs/topology/index.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/futil.h"
 #include "gromacs/utility/smalloc.h"
@@ -65,6 +66,40 @@ enum {
     VACF, MVACF, DOS, DOS_SOLID, DOS_DIFF, DOS_CP, DOS_S, DOS_A, DOS_E, DOS_NR
 };
 
+static int calcMoleculesInIndexGroup(t_block *mols, int natoms, atom_id *index, int nindex)
+{
+    int i    = 0;
+    int mol  = 0;
+    int nMol = 0;
+    int j;
+
+    while (i < nindex)
+    {
+        while (index[i] > mols->index[mol])
+        {
+            mol++;
+            if (mol >= mols->nr)
+            {
+                gmx_fatal(FARGS, "Atom index out of range: %d", index[i]+1);
+            }
+        }
+        for (j = mols->index[mol]; j < mols->index[mol+1]; j++)
+        {
+            if (index[i] != j)
+            {
+                gmx_fatal(FARGS, "The index group does not consist of whole molecules");
+            }
+            i++;
+            if (i == natoms)
+            {
+                gmx_fatal(FARGS, "Index contains atom numbers larger than the topology");
+            }
+        }
+        nMol++;
+    }
+    return nMol;
+}
+
 static double FD(double Delta, double f)
 {
     return (2*pow(Delta, -4.5)*pow(f, 7.5) -
@@ -214,47 +249,6 @@ static real wEsolid(real nu, real beta)
     }
 }
 
-static void dump_fy(output_env_t oenv, real toler)
-{
-    FILE       *fp;
-    double      Delta, f, y, DD;
-    const char *leg[] = { "f", "fy", "y" };
-
-    DD = pow(10.0, 0.125);
-    fp = xvgropen("fy.xvg", "Fig. 2, Lin2003a", "Delta", "y or fy", oenv);
-    xvgr_legend(fp, asize(leg), leg, oenv);
-    if (output_env_get_print_xvgr_codes(oenv))
-    {
-        fprintf(fp, "@ world 1e-05, 0, 1000, 1\n");
-        fprintf(fp, "@ xaxes scale Logarithmic\n");
-    }
-    for (Delta = 1e-5; (Delta <= 1000); Delta *= DD)
-    {
-        f = calc_fluidicity(Delta, toler);
-        y = calc_y(f, Delta, toler);
-        fprintf(fp, "%10g %10g %10g %10g\n", Delta, f, f*y, y);
-    }
-    xvgrclose(fp);
-}
-
-static void dump_w(output_env_t oenv, real beta)
-{
-    FILE       *fp;
-    double      nu;
-    const char *leg[] = { "wCv", "wS", "wA", "wE" };
-
-    fp = xvgropen("w.xvg", "Fig. 1, Berens1983a", "\\f{12}b\\f{4}h\\f{12}n",
-                  "w", oenv);
-    xvgr_legend(fp, asize(leg), leg, oenv);
-    for (nu = 1; (nu < 100); nu += 0.05)
-    {
-        fprintf(fp, "%10g %10g %10g %10g %10g\n", beta*PLANCK*nu,
-                wCsolid(nu, beta), wSsolid(nu, beta),
-                wAsolid(nu, beta), wEsolid(nu, beta));
-    }
-    xvgrclose(fp);
-}
-
 int gmx_dos(int argc, char *argv[])
 {
     const char *desc[] = {
@@ -264,6 +258,11 @@ int gmx_dos(int argc, char *argv[])
         "all vibrations. For flexible systems that would be around a few fs",
         "between saving. Properties based on the DoS are printed on the",
         "standard output."
+        "Note that the density of states is calculated from the mass-weighted",
+        "autocorrelation, and by default only from the square of the real",
+        "component rather than absolute value. This means the shape can differ",
+        "substantially from the plain vibrational power spectrum you can",
+        "calculate with gmx velacc."
     };
     const char *bugs[] = {
         "This program needs a lot of memory: total usage equals the number of atoms times 3 times number of frames times 4 (or 8 when run in double precision)."
@@ -284,10 +283,16 @@ int gmx_dos(int argc, char *argv[])
     gmx_fft_t           fft;
     double              cP, S, A, E, DiffCoeff, Delta, f, y, z, sigHS, Shs, Sig, DoS0, recip_fac;
     double              wCdiff, wSdiff, wAdiff, wEdiff;
-
-    static     gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalize = FALSE;
-    static     gmx_bool bRecip   = FALSE, bDump = FALSE;
+    int                 grpNatoms;
+    atom_id            *index;
+    char               *grpname;
+    double              invNormalize;
+    gmx_bool            normalizeAutocorrelation;
+
+    static     gmx_bool bVerbose = TRUE, bAbsolute = FALSE, bNormalizeDos = FALSE;
+    static     gmx_bool bRecip   = FALSE;
     static     real     Temp     = 298.15, toler = 1e-6;
+
     t_pargs             pa[]     = {
         { "-v", FALSE, etBOOL, {&bVerbose},
           "Be loud and noisy." },
@@ -295,14 +300,12 @@ int gmx_dos(int argc, char *argv[])
           "Use cm^-1 on X-axis instead of 1/ps for DoS plots." },
         { "-abs", FALSE, etBOOL, {&bAbsolute},
           "Use the absolute value of the Fourier transform of the VACF as the Density of States. Default is to use the real component only" },
-        { "-normdos", FALSE, etBOOL, {&bNormalize},
-          "Normalize the DoS such that it adds up to 3N. This is a hack that should not be necessary." },
+        { "-normdos", FALSE, etBOOL, {&bNormalizeDos},
+          "Normalize the DoS such that it adds up to 3N. This should usually not be necessary." },
         { "-T", FALSE, etREAL, {&Temp},
           "Temperature in the simulation" },
         { "-toler", FALSE, etREAL, {&toler},
-          "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" },
-        { "-dump", FALSE, etBOOL, {&bDump},
-          "[HIDDEN]Dump the y/fy plot corresponding to Fig. 2 inLin2003a and the and the weighting functions corresponding to Fig. 1 in Berens1983a." }
+          "[HIDDEN]Tolerance when computing the fluidicity using bisection algorithm" }
     };
 
     t_filenm            fnm[] = {
@@ -331,30 +334,26 @@ int gmx_dos(int argc, char *argv[])
     }
 
     beta = 1/(Temp*BOLTZ);
-    if (bDump)
-    {
-        printf("Dumping reference figures. Thanks for your patience.\n");
-        dump_fy(oenv, toler);
-        dump_w(oenv, beta);
-        exit(0);
-    }
 
     fplog = gmx_fio_fopen(ftp2fn(efLOG, NFILE, fnm), "w");
     fprintf(fplog, "Doing density of states analysis based on trajectory.\n");
     please_cite(fplog, "Pascal2011a");
     please_cite(fplog, "Caleman2011b");
 
-    read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box,
-                  TRUE);
+    read_tps_conf(ftp2fn(efTPR, NFILE, fnm), title, &top, &ePBC, NULL, NULL, box, TRUE);
+
+    /* Handle index groups */
+    get_index(&top.atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &grpNatoms, &index, &grpname);
+
     V     = det(box);
     tmass = 0;
-    for (i = 0; (i < top.atoms.nr); i++)
+    for (i = 0; i < grpNatoms; i++)
     {
-        tmass += top.atoms.atom[i].m;
+        tmass += top.atoms.atom[index[i]].m;
     }
 
-    Natom = top.atoms.nr;
-    Nmol  = top.mols.nr;
+    Natom = grpNatoms;
+    Nmol  = calcMoleculesInIndexGroup(&top.mols, top.atoms.nr, index, grpNatoms);
     gnx   = Natom*DIM;
 
     /* Correlation stuff */
@@ -390,9 +389,9 @@ int gmx_dos(int argc, char *argv[])
         }
         for (i = 0; i < gnx; i += DIM)
         {
-            c1[i+XX][nframes] = fr.v[i/DIM][XX];
-            c1[i+YY][nframes] = fr.v[i/DIM][YY];
-            c1[i+ZZ][nframes] = fr.v[i/DIM][ZZ];
+            c1[i+XX][nframes] = fr.v[index[i/DIM]][XX];
+            c1[i+YY][nframes] = fr.v[index[i/DIM]][YY];
+            c1[i+ZZ][nframes] = fr.v[index[i/DIM]][ZZ];
         }
 
         t1 = fr.time;
@@ -413,6 +412,21 @@ int gmx_dos(int argc, char *argv[])
         printf("Going to do %d fourier transforms of length %d. Hang on.\n",
                gnx, nframes);
     }
+    /* Unfortunately the -normalize program option for the autocorrelation
+     * function calculation is added as a hack with a static variable in the
+     * autocorrelation.c source. That would work if we called the normal
+     * do_autocorr(), but this routine overrides that by directly calling
+     * the low-level functionality. That unfortunately leads to ignoring the
+     * default value for the option (which is to normalize).
+     * Since the absolute value seems to be important for the subsequent
+     * analysis below, we detect the value directly from the option, calculate
+     * the autocorrelation without normalization, and then apply the
+     * normalization just to the autocorrelation output
+     * (or not, if the user asked for a non-normalized autocorrelation).
+     */
+    normalizeAutocorrelation = opt2parg_bool("-normalize", npargs, ppa);
+
+    /* Note that we always disable normalization here, regardless of user settings */
     low_do_autocorr(NULL, oenv, NULL, nframes, gnx, nframes, c1, dt, eacNormal, 0, FALSE,
                     FALSE, FALSE, -1, -1, 0);
     snew(dos, DOS_NR);
@@ -427,7 +441,7 @@ int gmx_dos(int argc, char *argv[])
     }
     for (i = 0; (i < gnx); i += DIM)
     {
-        mi = top.atoms.atom[i/DIM].m;
+        mi = top.atoms.atom[index[i/DIM]].m;
         for (j = 0; (j < nframes/2); j++)
         {
             c1j            = (c1[i+XX][j] + c1[i+YY][j] + c1[i+ZZ][j]);
@@ -435,20 +449,28 @@ int gmx_dos(int argc, char *argv[])
             dos[MVACF][j] += mi*c1j;
         }
     }
-    fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity ACF",
+
+    fp = xvgropen(opt2fn("-vacf", NFILE, fnm), "Velocity autocorrelation function",
                   "Time (ps)", "C(t)", oenv);
     snew(tt, nframes/2);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
     for (j = 0; (j < nframes/2); j++)
     {
         tt[j] = j*dt;
-        fprintf(fp, "%10g %10g\n", tt[j], dos[VACF][j]);
+        fprintf(fp, "%10g %10g\n", tt[j], dos[VACF][j] * invNormalize);
     }
     xvgrclose(fp);
-    fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity ACF",
+
+    fp = xvgropen(opt2fn("-mvacf", NFILE, fnm), "Mass-weighted velocity autocorrelation function",
                   "Time (ps)", "C(t)", oenv);
+
+    invNormalize = normalizeAutocorrelation ? 1.0/dos[VACF][0] : 1.0;
+
     for (j = 0; (j < nframes/2); j++)
     {
-        fprintf(fp, "%10g %10g\n", tt[j], dos[MVACF][j]);
+        fprintf(fp, "%10g %10g\n", tt[j], dos[MVACF][j] * invNormalize);
     }
     xvgrclose(fp);
 
@@ -483,7 +505,7 @@ int gmx_dos(int argc, char *argv[])
     }
     /* Normalize it */
     dostot = evaluate_integral(nframes/4, nu, dos[DOS], NULL, nframes/4, &stddev);
-    if (bNormalize)
+    if (bNormalizeDos)
     {
         for (j = 0; (j < nframes/4); j++)
         {
@@ -571,18 +593,6 @@ int gmx_dos(int argc, char *argv[])
     cP = BOLTZ * evaluate_integral(nframes/4, nu, dos[DOS_CP], NULL,
                                    nframes/4, &stddev);
     fprintf(fplog, "Heat capacity %g J/mol K\n", 1000*cP/Nmol);
-
-    /*
-       S  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_S],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Entropy %g J/mol K\n",1000*S/Nmol);
-       A  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_A],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Helmholtz energy %g kJ/mol\n",A/Nmol);
-       E  = BOLTZ * evaluate_integral(nframes/4,nu,dos[DOS_E],NULL,
-                                   nframes/4,&stddev);
-       fprintf(fplog,"Internal energy %g kJ/mol\n",E/Nmol);
-     */
     fprintf(fplog, "\nArrivederci!\n");
     gmx_fio_fclose(fplog);
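[Editorial note: a hedged usage sketch of the new index-group support; the file names and group choice are placeholders, and the -n flag corresponds to the efNDX entry read via get_index() above.]

    gmx dos -f traj.trr -s topol.tpr -n index.ndx -normdos

If the selected group does not contain whole molecules, calcMoleculesInIndexGroup() aborts with a fatal error, since the fluidicity analysis needs a well-defined molecule count Nmol.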
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.cpp b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
index 5e4733df1d..e93fe43d10 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.cpp
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
@@ -77,29 +77,40 @@
 
 #ifdef GMX_GPU
-const gmx_bool bGPUBinary = TRUE;
+
+static const bool bGPUBinary = TRUE;
+
 #  ifdef GMX_USE_OPENCL
-const char *gpu_implementation = "OpenCL";
+
+static const char *gpu_implementation = "OpenCL";
 /* Our current OpenCL implementation only supports using exactly one
  * GPU per PP rank, so sharing is impossible */
-const gmx_bool bGpuSharingSupported = FALSE;
+static const bool bGpuSharingSupported = false;
 /* Our current OpenCL implementation is not known to handle
  * concurrency correctly (at context creation, JIT compilation, or JIT
  * cache-management stages). OpenCL runtimes need not support it
  * either; library MPI segfaults when creating OpenCL contexts;
  * thread-MPI seems to work but is not yet known to be safe. */
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#  else
-const char *gpu_implementation = "CUDA";
-const gmx_bool bGpuSharingSupported = TRUE;
-const gmx_bool bMultiGpuPerNodeSupported = TRUE;
-#  endif
-#else
-const gmx_bool bGPUBinary = FALSE;
-const char *gpu_implementation = "non-GPU";
-const gmx_bool bGpuSharingSupported = FALSE;
-const gmx_bool bMultiGpuPerNodeSupported = FALSE;
-#endif
+static const bool bMultiGpuPerNodeSupported = false;
+
+#  else /* GMX_USE_OPENCL */
+
+// Our CUDA implementation supports everything
+static const char *gpu_implementation        = "CUDA";
+static const bool  bGpuSharingSupported      = true;
+static const bool  bMultiGpuPerNodeSupported = true;
+
+#  endif /* GMX_USE_OPENCL */
+
+#else /* GMX_GPU */
+
+// Not compiled with GPU support
+static const bool  bGPUBinary                = false;
+static const char *gpu_implementation        = "non-GPU";
+static const bool  bGpuSharingSupported      = false;
+static const bool  bMultiGpuPerNodeSupported = false;
+
+#endif /* GMX_GPU */
 
 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h) */
 const char * const gpu_detect_res_str[egpuNR] =
@@ -124,6 +135,16 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank);
 static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
                                     const gmx_gpu_opt_t  *gpu_opt);
 
+gmx_bool gmx_multiple_gpu_per_node_supported()
+{
+    return bMultiGpuPerNodeSupported;
+}
+
+gmx_bool gmx_gpu_sharing_supported()
+{
+    return bGpuSharingSupported;
+}
+
 static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
 {
     int i, ndev;
@@ -440,7 +461,8 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
         }
         else
         {
-            if (ngpu_comp > npppn)
+            /* TODO Should we have a gpu_opt->n_dev_supported field? */
+            if (ngpu_comp > npppn && gmx_multiple_gpu_per_node_supported())
             {
                 md_print_warn(cr, fplog,
                               "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
@@ -460,13 +482,26 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
              */
             if (cr->rank_pp_intranode == 0)
             {
+                std::string reasonForLimit;
+                if (ngpu_comp > 1 &&
+                    ngpu_use == 1 &&
+                    !gmx_multiple_gpu_per_node_supported())
+                {
+                    reasonForLimit  = "can be used by ";
+                    reasonForLimit += gpu_implementation;
+                    reasonForLimit += " in GROMACS";
+                }
+                else
+                {
+                    reasonForLimit = "was detected";
+                }
                 gmx_fatal(FARGS,
                           "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                          "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+                          "%s was started with %d PP %s%s%s, but only %d GPU%s %s.",
                           th_or_proc, btMPI ? "s" : "es", pernode,
                           ShortProgram(), npppn, th_or_proc,
                           th_or_proc_plural, pernode,
-                          ngpu_use, gpu_use_plural);
+                          ngpu_use, gpu_use_plural, reasonForLimit.c_str());
             }
         }
     }
@@ -552,7 +587,7 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
         {
             int device_id;
 
-            device_id           = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+            device_id           = gmx_gpu_sharing_supported() ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
             uniq_ids[device_id] = 1;
         }
         /* Count the devices used. */
@@ -1121,11 +1156,11 @@ void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
 
         parse_digits_from_plain_string(env,
                                        &gpu_opt->n_dev_use,
                                        &gpu_opt->dev_use);
-        if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+        if (!gmx_multiple_gpu_per_node_supported() && 1 < gpu_opt->n_dev_use)
         {
             gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per node", gpu_implementation);
         }
-        if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+        if (!gmx_gpu_sharing_supported() && anyGpuIdIsRepeated(gpu_opt))
         {
             gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
         }
@@ -1231,7 +1266,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
     {
         if (nrank % gpu_opt->n_dev_compatible == 0)
         {
-            nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
+            nshare = gmx_gpu_sharing_supported() ? nrank/gpu_opt->n_dev_compatible : 1;
         }
         else
         {
@@ -1252,7 +1287,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
         /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
         gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
-        if (!bMultiGpuPerNodeSupported)
+        if (!gmx_multiple_gpu_per_node_supported())
         {
             gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
         }
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
index 783016de1a..4e43fd0e1c 100644
--- a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
@@ -750,7 +750,7 @@ ocl_get_build_options_string(cl_context context,
 
     for (std::string::size_type i = 0; i < unescaped_ocl_root_path.length(); i++)
     {
-        if (inputStr[i] == ' ')
+        if (unescaped_ocl_root_path[i] == ' ')
         {
             ocl_root_path.push_back('\\');
         }
diff --git a/src/gromacs/gmxpreprocess/readpull.c b/src/gromacs/gmxpreprocess/readpull.c
index 49793283cc..f2fb2d2203 100644
--- a/src/gromacs/gmxpreprocess/readpull.c
+++ b/src/gromacs/gmxpreprocess/readpull.c
@@ -262,12 +262,12 @@ char **read_pullparams(int *ninp_p, t_inpfile **inp_p,
         int    ngroup;
 
         pcrd = &pull->coord[i-1];
-        sprintf(buf, "pull-coord%d-groups", i);
-        STYPE(buf, groups, "");
         sprintf(buf, "pull-coord%d-type", i);
         EETYPE(buf, pcrd->eType, epull_names);
         sprintf(buf, "pull-coord%d-geometry", i);
         EETYPE(buf, pcrd->eGeom, epullg_names);
+        sprintf(buf, "pull-coord%d-groups", i);
+        STYPE(buf, groups, "");
         nscan = sscanf(groups, "%d %d %d %d %d", &pcrd->group[0], &pcrd->group[1],
                        &pcrd->group[2], &pcrd->group[3], &idum);
         ngroup = (pcrd->eGeom == epullgDIRRELATIVE) ? 4 : 2;
diff --git a/src/gromacs/legacyheaders/gmx_detect_hardware.h b/src/gromacs/legacyheaders/gmx_detect_hardware.h
index 56b0384fe6..338ad8dd81 100644
--- a/src/gromacs/legacyheaders/gmx_detect_hardware.h
+++ b/src/gromacs/legacyheaders/gmx_detect_hardware.h
@@ -45,6 +45,18 @@ extern "C" {
 } /* fixes auto-indentation problems */
 #endif
 
+/*! \brief Return whether mdrun can use more than one GPU per node
+ *
+ * The OpenCL implementation cannot use more than one GPU per node,
+ * for example. */
+gmx_bool gmx_multiple_gpu_per_node_supported();
+
+/*! \brief Return whether PP ranks can share a GPU
+ *
+ * The OpenCL implementation cannot share a GPU between ranks, for
+ * example. */
+gmx_bool gmx_gpu_sharing_supported();
+
 /* the init and consistency functions depend on commrec that may not be
    consistent in cuda because MPI types don't exist there.  */
 #ifndef __CUDACC__
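/* [Editorial sketch, not part of the patch: how a caller can use these
 * capability queries. gpu_opt->n_dev_use is the user-requested GPU count
 * parsed in gmx_parse_gpu_ids() above; the message text is illustrative.] */
if (gpu_opt->n_dev_use > 1 && !gmx_multiple_gpu_per_node_supported())
{
    gmx_fatal(FARGS, "Only one GPU per node can be used by this build");
}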
diff --git a/src/gromacs/legacyheaders/types/commrec.h b/src/gromacs/legacyheaders/types/commrec.h
index cfc7669a2f..64521b85ae 100644
--- a/src/gromacs/legacyheaders/types/commrec.h
+++ b/src/gromacs/legacyheaders/types/commrec.h
@@ -3,7 +3,7 @@
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -161,7 +161,6 @@ struct gmx_domdec_t {
     ivec     nc;
     int      ndim;
     ivec     dim; /* indexed by 0 to ndim */
-    gmx_bool bGridJump;
 
     /* PBC from dim 0 to npbcdim */
     int npbcdim;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
index f6ee458292..e09de482f9 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -297,16 +297,16 @@ struct gmx_nbnxn_ocl_t
     cl_command_queue stream[2]; /**< local and non-local GPU queues */
 
     /** events used for synchronization */
-    cl_event nonlocal_done;              /**< event triggered when the non-local non-bonded kernel
-                                            is done (and the local transfer can proceed) */
-    cl_event isc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
-                                            the local stream that need to precede the
-                                            non-local force calculations are done
-                                            (e.g. f buffer 0-ing, local x/q H2D) */
-
-    cl_bool                     bDoTime; /**< True if event-based timing is enabled. */
-    cl_timers_t                *timers;  /**< OpenCL event-based timers. */
-    struct gmx_wallclock_gpu_t *timings; /**< Timing data. */
+    cl_event nonlocal_done;               /**< event triggered when the non-local non-bonded kernel
+                                             is done (and the local transfer can proceed) */
+    cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
+                                             the local stream that need to precede the
+                                             non-local force calculations are done
+                                             (e.g. f buffer 0-ing, local x/q H2D) */
+
+    cl_bool                     bDoTime;  /**< True if event-based timing is enabled. */
+    cl_timers_t                *timers;   /**< OpenCL event-based timers. */
+    struct gmx_wallclock_gpu_t *timings;  /**< Timing data. */
 };
 
 #ifdef __cplusplus
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c
index a372082356..16fd3b0101 100644
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -4014,7 +4014,8 @@ static void split_sci_entry(nbnxn_pairlist_t *nbl,
             nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
         }
 
-        if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
+        /* Check if we should split at this cj4 to get a list of size nsp */
+        if (nsp > 0 && nsp + nsp_cj4 > nsp_max)
         {
             /* Split the list at cj4 */
             nbl->sci[sci].cj4_ind_end = cj4;
@@ -4379,7 +4380,7 @@ static void get_nsubpair_target(const nbnxn_search_t nbs,
 
         ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
         ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
-        ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
+        ls[ZZ] = grid->na_c/(grid->atom_density*ls[XX]*ls[YY]);
 
         /* The average squared length of the diagonal of a sub cell */
         xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
@@ -4409,11 +4410,25 @@ static void get_nsubpair_target(const nbnxn_search_t nbs,
         /* 4 octants of a sphere */
         vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
 
+        /* Estimate the number of cluster pairs as the local number of
+         * clusters times the volume they interact with times the density.
+         */
         nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 
         /* Subtract the non-local pair count */
         nsp_est -= nsp_est_nl;
 
+        /* For small cut-offs nsp_est will be an underestimate.
+         * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
+         * So to avoid too small or negative nsp_est we set a minimum of
+         * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
+         * This might be a slight overestimate for small non-periodic groups of
+         * atoms as will occur for a local domain with DD, but for small
+         * groups of atoms we'll anyhow be limited by nsubpair_target_min,
+         * so this overestimation will not matter.
+         */
+        nsp_est = max(nsp_est, grid->nsubc_tot*14.0);
+
         if (debug)
         {
             fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
diff --git a/src/gromacs/mdlib/nsgrid.cpp b/src/gromacs/mdlib/nsgrid.cpp
index b92d5a5c76..63649e3bee 100644
--- a/src/gromacs/mdlib/nsgrid.cpp
+++ b/src/gromacs/mdlib/nsgrid.cpp
@@ -250,7 +250,7 @@ static void set_grid_sizes(matrix box, rvec izones_x0, rvec izones_x1, real rlis
              * direction has uniform DD cell boundaries.
             */
            bDDRect = !(ddbox->tric_dir[i] ||
-                        (dd->bGridJump && i != dd->dim[0]));
+                        (dd_dlb_is_on(dd) && i != dd->dim[0]));
 
             radd = rlist;
             if (i >= ddbox->npbcdim &&
diff --git a/src/gromacs/trajectoryanalysis/modules/distance.cpp b/src/gromacs/trajectoryanalysis/modules/distance.cpp
index 96f752b5d5..4b07d1483a 100644
--- a/src/gromacs/trajectoryanalysis/modules/distance.cpp
+++ b/src/gromacs/trajectoryanalysis/modules/distance.cpp
@@ -1,7 +1,7 @@
 /*
 * This file is part of the GROMACS molecular simulation package.
 *
- * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -271,7 +271,7 @@ Distance::initAnalysis(const TrajectoryAnalysisSettings &settings,
     {
         AnalysisDataPlotModulePointer plotm(
                 new AnalysisDataPlotModule(settings.plotSettings()));
-        plotm->setFileName(fnAll_);
+        plotm->setFileName(fnXYZ_);
         plotm->setTitle("Distance");
         plotm->setXAxisIsTime();
         plotm->setYLabel("Distance (nm)");
diff --git a/src/programs/mdrun/resource-division.cpp b/src/programs/mdrun/resource-division.cpp
index 8378845cc1..9042aee8a9 100644
--- a/src/programs/mdrun/resource-division.cpp
+++ b/src/programs/mdrun/resource-division.cpp
@@ -53,6 +53,22 @@
 
 #include "gromacs/utility/fatalerror.h"
 
+/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
+ * The real switching points will depend on the system simulation,
+ * the algorithms used and the hardware it's running on, as well as if there
+ * are other jobs running on the same machine. We try to take into account
+ * factors that have a large influence, such as recent Intel CPUs being
+ * much better at wide multi-threading. The remaining factors should
+ * (hopefully) have a small influence, such that the performance just before
+ * and after a switch point doesn't change too much.
+ */
+
+#ifdef GMX_OPENMP
+static const bool bOMP = true;
+#else
+static const bool bOMP = false;
+#endif
+
 #ifdef GMX_THREAD_MPI
 /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 * the number of threads will get lowered.
@@ -64,7 +80,7 @@ static const int min_atoms_per_gpu        = 900;
 /* TODO choose nthreads_omp based on hardware topology
    when we have a hardware topology detection library */
 /* First we consider the case of no MPI (1 MPI rank).
- * In general, when running up to 4 threads, OpenMP should be faster.
+ * In general, when running up to 8 threads, OpenMP should be faster.
 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 * even on two CPUs it's usually faster (but with many OpenMP threads
@@ -77,16 +93,16 @@ static const int min_atoms_per_gpu        = 900;
 * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
 * model numbers we ensure also future Intel CPUs are covered.
 */
-const int nthreads_omp_always_faster_default   =  6;
-const int nthreads_omp_always_faster_Nehalem   = 12;
-const int nthreads_omp_always_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_default   =  8;
+const int nthreads_omp_faster_Nehalem   = 12;
+const int nthreads_omp_faster_Intel_AVX = 16;
 
 /* For CPU only runs the fastest options are usually MPI or OpenMP only.
 * With one GPU, using MPI only is almost never optimal, so we need to
 * compare running pure OpenMP with combined MPI+OpenMP. This means higher
 * OpenMP threads counts can still be ok. Multiplying the numbers above
 * by a factor of 2 seems to be a good estimate.
 */
-const int nthreads_omp_always_faster_gpu_fac   =  2;
+const int nthreads_omp_faster_gpu_fac   =  2;
 
 /* This is the case with MPI (2 or more MPI PP ranks).
 * By default we will terminate with a fatal error when more than 8
@@ -104,34 +120,30 @@ const int nthreads_omp_mpi_ok_min_gpu      = 2;
 const int nthreads_omp_mpi_target_max      = 6;
 
 
-#ifdef GMX_USE_OPENCL
-static const bool bGpuSharingSupported = false;
-#else
-static const bool bGpuSharingSupported = true;
-#endif
-
-
-static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
+/* Returns the maximum OpenMP thread count for which using a single MPI rank
+ * should be faster than using multiple ranks with the same total thread count.
+ */
+static int nthreads_omp_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
 {
     int nth;
 
     if (gmx_cpuid_vendor(cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
         gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX))
     {
-        nth = nthreads_omp_always_faster_Intel_AVX;
+        nth = nthreads_omp_faster_Intel_AVX;
     }
     else if (gmx_cpuid_is_intel_nehalem(cpuid_info))
     {
-        nth = nthreads_omp_always_faster_Nehalem;
+        nth = nthreads_omp_faster_Nehalem;
     }
     else
     {
-        nth = nthreads_omp_always_faster_default;
+        nth = nthreads_omp_faster_default;
     }
 
     if (bUseGPU)
     {
-        nth *= nthreads_omp_always_faster_gpu_fac;
+        nth *= nthreads_omp_faster_gpu_fac;
     }
 
     nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
@@ -139,6 +151,26 @@ static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
     return nth;
 }
 
+/* Returns the maximum OpenMP thread count that passes the efficiency check */
+static int nthreads_omp_efficient_max(int gmx_unused nrank,
+                                      gmx_cpuid_t    cpuid_info,
+                                      gmx_bool       bUseGPU)
+{
+#if defined GMX_OPENMP && defined GMX_MPI
+    if (nrank > 1)
+    {
+        return nthreads_omp_mpi_ok_max;
+    }
+    else
+#endif
+    {
+        return nthreads_omp_faster(cpuid_info, bUseGPU);
+    }
+}
+
+/* Return the number of thread-MPI ranks to use.
+ * This is chosen such that we can always obey our own efficiency checks.
+ */
 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                         const gmx_hw_opt_t  *hw_opt,
                                         int                  nthreads_tot,
@@ -161,9 +193,9 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
             /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
             nrank = nthreads_tot;
         }
-        else if (bGpuSharingSupported &&
-                 (nthreads_tot > nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                            ngpu > 0) ||
+        else if (gmx_gpu_sharing_supported() &&
+                 (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
+                                                     ngpu > 0) ||
                   (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
         {
             /* The high OpenMP thread count will likely result in sub-optimal
@@ -193,8 +225,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
     }
     else
     {
-        if (nthreads_tot <= nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                       ngpu > 0))
+        if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
         {
             /* Use pure OpenMP parallelization */
             nrank = 1;
@@ -210,6 +241,34 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 }
 
 
+static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
+{
+    /* This code relies on the fact that GPU are not detected when GPU
+     * acceleration was disabled at run time by the user.
+     */
+    if (cutoff_scheme == ecutsVERLET &&
+        hwinfo->gpu_info.n_dev_compatible > 0)
+    {
+        if (gmx_multiple_gpu_per_node_supported())
+        {
+            return hwinfo->gpu_info.n_dev_compatible;
+        }
+        else
+        {
+            if (hwinfo->gpu_info.n_dev_compatible > 1)
+            {
+                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
+            }
+            return 1;
+        }
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+
 #ifdef GMX_THREAD_MPI
 /* Get the number of MPI ranks to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
@@ -219,7 +278,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                      const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const t_commrec     *cr,
@@ -227,16 +286,15 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 {
     int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
     int      min_atoms_per_mpi_rank;
-    gmx_bool bCanUseGPU;
 
     /* Check if an algorithm does not support parallel simulation. */
     if (inputrec->eI == eiLBFGS ||
         inputrec->coulombtype == eelEWALD)
     {
-        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
 #ifdef GMX_THREAD_MPI
 /* Get the number of MPI ranks to use for thread-MPI based on how many
  * were requested, which algorithms we're using,
@@ -219,7 +278,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
  * with the hardware, except that ntmpi could be larger than #GPU.
  */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                      const t_inputrec    *inputrec,
                      const gmx_mtop_t    *mtop,
                      const t_commrec     *cr,
@@ -227,16 +286,15 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 {
     int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
     int      min_atoms_per_mpi_rank;
-    gmx_bool bCanUseGPU;
 
     /* Check if an algorithm does not support parallel simulation. */
     if (inputrec->eI == eiLBFGS ||
         inputrec->coulombtype == eelEWALD)
     {
-        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
         if (hw_opt->nthreads_tmpi > 1)
         {
-            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI rank, but an algorithm doesn't support that");
         }
 
         return 1;
@@ -266,16 +324,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
         nthreads_tot_max = nthreads_hw;
     }
 
-    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
-                  hwinfo->gpu_info.n_dev_compatible > 0);
-    if (bCanUseGPU)
-    {
-        ngpu = hwinfo->gpu_info.n_dev_compatible;
-    }
-    else
-    {
-        ngpu = 0;
-    }
+    ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);
 
     if (inputrec->cutoff_scheme == ecutsGROUP)
     {
@@ -297,7 +346,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
     }
     else
     {
-        if (bCanUseGPU)
+        if (ngpu >= 1)
         {
             min_atoms_per_mpi_rank = min_atoms_per_gpu;
         }
@@ -362,6 +411,26 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 
             nrank = nrank_new;
 
+            /* We reduced the number of tMPI ranks, which means we might violate
+             * our own efficiency checks if we simply use all hardware threads.
+             */
+            if (bOMP && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
+            {
+                /* The user set neither the total nor the OpenMP thread count,
+                 * we should use all hardware threads, unless we will violate
+                 * our own efficiency limitation on the thread count.
+                 */
+                int nt_omp_max;
+
+                nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);
+
+                if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
+                {
+                    /* Limit the number of OpenMP threads to start */
+                    hw_opt->nthreads_omp = nt_omp_max;
+                }
+            }
+
             fprintf(stderr, "\n");
             fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
             fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
 
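The block just added closes a loophole: when nrank has been lowered to satisfy min_atoms_per_mpi_rank, silently inheriting all hardware threads could push the per-rank OpenMP count past the efficiency limit. The decision reduces to a single comparison; an isolated sketch (illustrative helper, while in the diff the value is written into hw_opt->nthreads_omp only when a cap is needed):

    /* Returns the explicit per-rank OpenMP cap to set, or 0 to leave the
     * option unset so the later default (all threads / nrank) applies. */
    static int ompThreadCap(int nrank, int hwThreadsAvail, int ompEfficientMax)
    {
        return (nrank*ompEfficientMax < hwThreadsAvail) ? ompEfficientMax : 0;
    }

For example, 2 ranks with an efficiency limit of 8 on a 32-thread node would otherwise start 16 OpenMP threads each; the cap starts 8 per rank and deliberately leaves half the hardware threads idle.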
@@ -375,7 +444,7 @@
 
 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                         const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                         t_commrec           *cr,
                                         FILE                *fplog)
 {
@@ -392,6 +461,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
      * OpenMP have been initialized. Check that here.
      */
 #ifdef GMX_THREAD_MPI
+    assert(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max);
     assert(hw_opt->nthreads_tmpi >= 1);
 #endif
     assert(gmx_omp_nthreads_get(emntDefault) >= 1);
@@ -434,7 +504,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
     if (DOMAINDECOMP(cr) && cr->nnodes > 1)
     {
         if (nth_omp_max < nthreads_omp_mpi_ok_min ||
-            (!(ngpu > 0 && !bGpuSharingSupported) &&
+            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
              nth_omp_max > nthreads_omp_mpi_ok_max))
         {
             /* Note that we print target_max here, not ok_max */
@@ -443,7 +513,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                     nthreads_omp_mpi_ok_min,
                     nthreads_omp_mpi_target_max);
 
-            if (bNTOptSet)
+            if (bNtOmpOptionSet)
             {
                 md_print_warn(cr, fplog, "NOTE: %s\n", buf);
             }
@@ -460,8 +530,8 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
     else
     {
         /* No domain decomposition (or only one domain) */
-        if (!(ngpu > 0 && !bGpuSharingSupported) &&
-            nth_omp_max > nthreads_omp_always_faster(hwinfo->cpuid_info, ngpu > 0))
+        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
+            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
         {
             /* To arrive here, the user/system set #ranks and/or #OMPthreads */
             gmx_bool bEnvSet;
@@ -469,7 +539,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
 
             bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
 
-            if (bNTOptSet || bEnvSet)
+            if (bNtOmpOptionSet || bEnvSet)
             {
                 sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
             }
@@ -489,7 +559,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
              * with different values per rank or node, since in that case
              * the user can not set -ntomp to override the error.
              */
-            if (bNTOptSet || (bEnvSet && nth_omp_min != nth_omp_max))
+            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
             {
                 md_print_warn(cr, fplog, "NOTE: %s\n", buf);
             }
@@ -502,7 +572,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
 #else /* GMX_OPENMP && GMX_MPI */
     /* No OpenMP and/or MPI: it doesn't make much sense to check */
     GMX_UNUSED_VALUE(hw_opt);
-    GMX_UNUSED_VALUE(bNTOptSet);
+    GMX_UNUSED_VALUE(bNtOmpOptionSet);
 
     /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
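Stripped of the message formatting, the domain-decomposition branch above accepts per-rank OpenMP counts inside a band and flags everything else; bNtOmpOptionSet only decides whether a violation is a note or a fatal error. A condensed sketch of the acceptance test (bounds passed as parameters instead of the file-scope constants):

    /* True when nthOmpMax should be flagged: below the efficient minimum,
     * or above the maximum unless GPUs are in use on a build without GPU
     * sharing support, where high per-rank counts are unavoidable. */
    static bool ompCountOutOfBand(int nthOmpMax, int okMin, int okMax,
                                  int ngpu, bool gpuSharingSupported)
    {
        return nthOmpMax < okMin ||
               (!(ngpu > 0 && !gpuSharingSupported) && nthOmpMax > okMax);
    }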
@@ -538,17 +608,18 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
     }
     if (hw_opt->nthreads_tmpi > 0)
     {
-        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
+        gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
     }
 #endif
 
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP)
     {
-        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        }
+        hw_opt->nthreads_omp = 1;
     }
-    hw_opt->nthreads_omp = 1;
-#endif
 
     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
     {
@@ -559,14 +630,14 @@
         hw_opt->nthreads_omp > 0 &&
         hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
     {
-        gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+        gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
                   hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
     }
 
     if (hw_opt->nthreads_tmpi > 0 &&
         hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
     {
-        gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+        gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
                   hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
     }
 
@@ -584,12 +655,10 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
         }
     }
 
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP && hw_opt->nthreads_omp > 1)
     {
         gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
     }
-#endif
 
     if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
     {
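check_and_update_hw_opt_1() mostly rewords its messages from "threads" to "ranks", but the arithmetic it enforces is easy to state on its own. A hypothetical self-contained check of the same two rules (not the GROMACS function itself):

    #include <cassert>

    static bool threadCountsConsistent(int nTot, int nTmpi, int nOmp)
    {
        /* When all three counts are requested, they must multiply out... */
        if (nTot > 0 && nTmpi > 0 && nOmp > 0 && nTot != nTmpi*nOmp)
        {
            return false;
        }
        /* ...and the total must divide evenly over the thread-MPI ranks. */
        if (nTot > 0 && nTmpi > 0 && nTot % nTmpi != 0)
        {
            return false;
        }
        return true;
    }

    int main()
    {
        assert(threadCountsConsistent(12, 4, 3));  /* 12 == 4*3   */
        assert(!threadCountsConsistent(12, 5, 0)); /* 12 % 5 != 0 */
        return 0;
    }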
diff --git a/src/programs/mdrun/resource-division.h b/src/programs/mdrun/resource-division.h
index ca98237741..42f4f05182 100644
--- a/src/programs/mdrun/resource-division.h
+++ b/src/programs/mdrun/resource-division.h
@@ -45,9 +45,10 @@
  * At the point we have already called check_and_update_hw_opt.
  * Thus all options should be internally consistent and consistent
  * with the hardware, except that ntmpi could be larger than #GPU.
+ * If necessary, this function will modify hw_opt->nthreads_omp.
  */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                      const t_inputrec    *inputrec,
                      const gmx_mtop_t    *mtop,
                      const t_commrec     *cr,
@@ -58,12 +59,12 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 * intended to catch cases where the user starts 1 MPI rank per hardware
 * thread or 1 rank per physical node.
 * With a sub-optimal setup a note is printed to fplog and stderr when
- * bNtOptSet==TRUE; with bNtOptSet==FALSE a fatal error is issued.
+ * bNtOmpOptionSet==TRUE; with bNtOmpOptionSet==FALSE a fatal error is issued.
 * This function should be called after thread-MPI and OpenMP are set up.
 */
 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                         const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                         t_commrec           *cr,
                                         FILE                *fplog);
 
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index a917c62ec5..f25ad46c9b 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -781,10 +781,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                                                 hw_opt, inputrec, mtop,
                                                 cr, fplog);
 
-        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
-        {
-            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
-        }
         if (hw_opt->nthreads_tmpi > 1)
         {
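The runner.cpp hunk deletes the caller-side fallback that spread a fixed total thread count evenly over the ranks. Per the new header comment, get_nthreads_mpi() may now modify hw_opt->nthreads_omp itself, so the derivation lives inside that function rather than at the call site. The removed arithmetic, in an illustrative standalone form (assumes nTmpi >= 1, which holds once get_nthreads_mpi() has returned):

    /* The logic removed from mdrunner(): with an explicit total but no
     * explicit per-rank count, derive the per-rank OpenMP thread count. */
    static int derivedOmpCount(int nTot, int nTmpi, int nOmp)
    {
        return (nTot > 0 && nOmp <= 0) ? nTot/nTmpi : nOmp;
    }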