From: Mark Abraham Date: Wed, 1 Oct 2014 20:02:02 +0000 (+0200) Subject: Merge release-5-0 into master X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=19d3c2e5d0c401eb59010960d11a18b6ba2c54c6;hp=fe90f1c1c71a3a43a27ec9ba76e772ae54786c7f;p=alexxy%2Fgromacs.git Merge release-5-0 into master Conflicts: CMakeLists.txt Version numbers not bumped; fixed to use the right name for RelWithDebInfo. cmake/gmxCFlags.cmake Fixed to use the right name for RelWithDebInfo. src/gromacs/listed-forces/bonded.cpp New RB SIMD function in bonded.cpp had unused variables, now eliminated. src/gromacs/mdlib/domdec.cpp Bug fixes from release-5-0 incorporated. std::max now used in code newly arrived from release-5-0. md.cpp had no conflict, but fr->nbv->bUseGPU had to be replaced by use_GPU(fr->nbv) to work in master branch. Change-Id: I65326b691745111fbdaa9435be6c92fa1acf6e7d --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 70185949c4..7fbe947522 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ if(CMAKE_CONFIGURATION_TYPES) "List of configuration types" FORCE) endif() -set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE) +set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE) set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) diff --git a/admin/uncrustify.cfg b/admin/uncrustify.cfg index d89351a7e6..f7308115b2 100644 --- a/admin/uncrustify.cfg +++ b/admin/uncrustify.cfg @@ -1575,3 +1575,12 @@ pp_define_at_level = false # false/true # all tokens are separated by any mix of ',' commas, '=' equal signs # and whitespace (space, tab) # + +# Teach uncrustify about the GROMACS attribute aliases that we use +# to hide compiler differences. This means that declarations like +# +# int i, j; +# int nthreads gmx_unused; +# +# do not align i with gmx_unused. +set ATTRIBUTE gmx_unused gmx_inline gmx_restrict diff --git a/cmake/gmxCFlags.cmake b/cmake/gmxCFlags.cmake index 011cada36e..3a0e3bbe15 100644 --- a/cmake/gmxCFlags.cmake +++ b/cmake/gmxCFlags.cmake @@ -64,7 +64,7 @@ function(gmx_set_cmake_compiler_flags) # be set up elsewhere and passed to this function, but it is # inconvenient in CMake to pass more than one list, and such a # list is only used here. - foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE) + foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE) set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}") endforeach() # Copy the flags that are only used by the real Release build @@ -85,9 +85,12 @@ function(gmx_set_cmake_compiler_flags) endif() # Append to the variables for the given build type for - # each language, in the parent scope. + # each language, in the parent scope. We add our new variables at the end, so + # compiler-specific choices are more likely to override default CMake choices. + # This is for instance useful for RelWithDebInfo builds, where we want to use the full + # set of our optimization flags detected in this file, rather than having -O2 override them. 
set(CMAKE_${language}_FLAGS${punctuation}${build_type} - "${GMXC_${language}FLAGS${punctuation}${build_type}} ${CMAKE_${language}_FLAGS${punctuation}${build_type}}" + "${CMAKE_${language}_FLAGS${punctuation}${build_type}} ${GMXC_${language}FLAGS${punctuation}${build_type}}" PARENT_SCOPE) endforeach() endforeach() diff --git a/cmake/gmxTestCompilerProblems.cmake b/cmake/gmxTestCompilerProblems.cmake index e534730013..a8cdb870df 100644 --- a/cmake/gmxTestCompilerProblems.cmake +++ b/cmake/gmxTestCompilerProblems.cmake @@ -80,8 +80,11 @@ macro(gmx_test_compiler_problems) message(WARNING "All tested PGI compiler versions (up to 12.9.0) generate binaries which produce incorrect results, or even fail to compile Gromacs. Highly recommended to use a different compiler. If you choose to use PGI, make sure to run the regressiontests.") endif() - if(CMAKE_COMPILER_IS_GNUCC AND WIN32 AND (GMX_SIMD STREQUAL "AVX_256" OR GMX_SIMD STREQUAL "AVX2_256")) - message(WARNING "GCC on Windows with AVX crashes. Choose SSE4_1 or a different compiler.") # GCC bug 49001. + if(CMAKE_COMPILER_IS_GNUCC AND + (CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9.0" OR CMAKE_SIZEOF_VOID_P EQUAL 8) + AND (WIN32 OR CYGWIN) + AND GMX_SIMD MATCHES "AVX" AND NOT GMX_SIMD STREQUAL AVX_128_FMA) + message(WARNING "GCC on Windows (GCC older than 4.9 or any version when compiling for 64bit) with AVX (other than AVX_128_FMA) crashes. Choose a different GMX_SIMD or a different compiler.") # GCC bug 49001, 54412. endif() if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND WIN32 AND NOT CYGWIN) diff --git a/src/gromacs/gmxana/gmx_tune_pme.c b/src/gromacs/gmxana/gmx_tune_pme.c index b9c2af3e71..c4a7e6e021 100644 --- a/src/gromacs/gmxana/gmx_tune_pme.c +++ b/src/gromacs/gmxana/gmx_tune_pme.c @@ -1308,9 +1308,12 @@ static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp, { /* To prevent confusion, do not again issue a gmx_fatal here since we already * get the error message from mdrun itself */ - sprintf(msg, "Cannot run the benchmark simulations! Please check the error message of\n" + sprintf(msg, + "Cannot run the first benchmark simulation! Please check the error message of\n" "mdrun for the source of the problem. Did you provide a command line\n" - "argument that neither g_tune_pme nor mdrun understands? Offending command:\n" + "argument that neither gmx tune_pme nor mdrun understands? If you're\n" + "sure your command line should work, you can bypass this check with \n" + "gmx tune_pme -nocheck. The failing command was:\n" "\n%s\n\n", command); fprintf(stderr, "%s", msg); @@ -1341,7 +1344,7 @@ static void do_the_tests( int npme_fixed, /* If >= -1, test fixed number of PME * nodes only */ const char *npmevalues_opt, /* Which -npme values should be tested */ - t_perf **perfdata, /* Here the performace data is stored */ + t_perf **perfdata, /* Here the performance data is stored */ int *pmeentries, /* Entries in the nPMEnodes list */ int repeats, /* Repeat each test this often */ int nnodes, /* Total number of nodes = nPP + nPME */ @@ -1354,7 +1357,8 @@ static void do_the_tests( const t_filenm *fnm, /* List of filenames from command line */ int nfile, /* Number of files specified on the cmdl. 
*/ int presteps, /* DLB equilibration steps, is checked */ - gmx_int64_t cpt_steps) /* Time step counter in the checkpoint */ + gmx_int64_t cpt_steps, /* Time step counter in the checkpoint */ + gmx_bool bCheck) /* Check whether benchmark mdrun works */ { int i, nr, k, ret, count = 0, totaltests; int *nPMEnodes = NULL; @@ -1454,8 +1458,11 @@ static void do_the_tests( cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench); /* To prevent that all benchmarks fail due to a show-stopper argument - * on the mdrun command line, we make a quick check first */ - if (bFirst) + * on the mdrun command line, we make a quick check first. + * This check can be turned off in cases where the automatically chosen + * number of PME-only ranks leads to a number of PP ranks for which no + * decomposition can be found (e.g. for large prime numbers) */ + if (bFirst && bCheck) { make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile); } @@ -2016,6 +2023,9 @@ int gmx_tune_pme(int argc, char *argv[]) "need to provide a machine- or hostfile. This can also be passed", "via the MPIRUN variable, e.g.[PAR]", "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]", + "Before doing the actual benchmark runs, [THISMODULE] will do a quick", + "check whether mdrun works as expected with the provided parallel settings", + "if the [TT]-check[tt] option is activated (the default).", "Please call [THISMODULE] with the normal options you would pass to", "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the", "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]", @@ -2025,7 +2035,7 @@ int gmx_tune_pme(int argc, char *argv[]) "written with enlarged cutoffs and smaller Fourier grids respectively.", "Typically, the first test (number 0) will be with the settings from the input", "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff", - "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ", + "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ", "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ", "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier " "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1", @@ -2035,7 +2045,7 @@ int gmx_tune_pme(int argc, char *argv[]) "MD systems. The dynamic load balancing needs about 100 time steps", "to adapt to local load imbalances, therefore the time step counters", "are by default reset after 100 steps. For large systems (>1M atoms), as well as ", - "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.", + "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.", "From the 'DD' load imbalance entries in the md.log output file you", "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]" "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]", @@ -2162,6 +2172,7 @@ int gmx_tune_pme(int argc, char *argv[]) gmx_bool bKeepAndNumCPT = FALSE; gmx_bool bResetCountersHalfWay = FALSE; gmx_bool bBenchmark = TRUE; + gmx_bool bCheck = TRUE; output_env_t oenv = NULL; @@ -2205,6 +2216,8 @@ int gmx_tune_pme(int argc, char *argv[]) "Launch the real simulation after optimization" }, { "-bench", FALSE, etBOOL, {&bBenchmark}, "Run the benchmarks or just create the input [TT].tpr[tt] files?" 
}, + { "-check", FALSE, etBOOL, {&bCheck}, + "Before the benchmark runs, check whether mdrun works in parallel" }, /******************/ /* mdrun options: */ /******************/ @@ -2457,7 +2470,7 @@ int gmx_tune_pme(int argc, char *argv[]) { do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries, repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, - cmd_args_bench, fnm, NFILE, presteps, cpt_steps); + cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck); fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0); diff --git a/src/gromacs/gmxlib/calcgrid.c b/src/gromacs/gmxlib/calcgrid.c index 1de1f57efe..7bbee76f12 100644 --- a/src/gromacs/gmxlib/calcgrid.c +++ b/src/gromacs/gmxlib/calcgrid.c @@ -136,7 +136,7 @@ real calc_grid(FILE *fp, matrix box, real gr_sp, /* Determine how many pre-factors of 2 we need */ fac2 = 1; i = g_baseNR - 1; - while (fac2*grid_base[i-1] < nmin) + while (fac2*grid_base[i] < nmin) { fac2 *= 2; } diff --git a/src/gromacs/gmxpreprocess/toppush.c b/src/gromacs/gmxpreprocess/toppush.c index 161940b72e..8272b7787e 100644 --- a/src/gromacs/gmxpreprocess/toppush.c +++ b/src/gromacs/gmxpreprocess/toppush.c @@ -599,9 +599,11 @@ static void push_bondtype(t_params * bt, { sprintf(errbuf, "Overriding %s parameters.%s", interaction_function[ftype].longname, - (ftype == F_PDIHS) ? "\nUse dihedraltype 4 to allow several multiplicity terms." : ""); + (ftype == F_PDIHS) ? + "\nUse dihedraltype 9 to allow several multiplicity terms. Only consecutive lines are combined. Non-consective lines overwrite each other." + : ""); warning(wi, errbuf); - fprintf(stderr, " old:"); + fprintf(stderr, " old: "); for (j = 0; (j < nrfp); j++) { fprintf(stderr, " %g", bt->param[i].c[j]); diff --git a/src/gromacs/legacyheaders/domdec.h b/src/gromacs/legacyheaders/domdec.h index 13b55711e8..c44c7bf840 100644 --- a/src/gromacs/legacyheaders/domdec.h +++ b/src/gromacs/legacyheaders/domdec.h @@ -139,6 +139,12 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr); * possible after subsequently setting a shorter cut-off with change_dd_cutoff. */ +gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd); +/* Return if the DLB lock is set */ + +void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue); +/* Set a lock such that with DLB=auto DLB can (not) get turned on */ + void dd_setup_dlb_resource_sharing(t_commrec *cr, const gmx_hw_info_t *hwinfo, const gmx_hw_opt_t *hw_opt); diff --git a/src/gromacs/listed-forces/bonded.cpp b/src/gromacs/listed-forces/bonded.cpp index b1ded380a4..dfb846d9e7 100644 --- a/src/gromacs/listed-forces/bonded.cpp +++ b/src/gromacs/listed-forces/bonded.cpp @@ -2117,6 +2117,152 @@ pdihs_noener_simd(int nbonds, } } +/* This is mostly a copy of pdihs_noener_simd above, but with using + * the RB potential instead of a harmonic potential. + * This function can replace rbdihs() when no energy and virial are needed. 
+ */ +static void +rbdihs_noener_simd(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + const int nfa1 = 5; + int i, iu, s, j; + int type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH]; + real dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr; + real buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf; + real *parm, *p, *q; + + gmx_simd_real_t phi_S; + gmx_simd_real_t ddphi_S, cosfac_S; + gmx_simd_real_t mx_S, my_S, mz_S; + gmx_simd_real_t nx_S, ny_S, nz_S; + gmx_simd_real_t nrkj_m2_S, nrkj_n2_S; + gmx_simd_real_t parm_S, c_S; + gmx_simd_real_t sin_S, cos_S; + gmx_simd_real_t sf_i_S, msf_l_S; + pbc_simd_t pbc_simd; + + gmx_simd_real_t pi_S = gmx_simd_set1_r(M_PI); + gmx_simd_real_t one_S = gmx_simd_set1_r(1.0); + + /* Ensure SIMD register alignment */ + dr = gmx_simd_align_r(dr_array); + buf = gmx_simd_align_r(buf_array); + + /* Extract aligned pointers for parameters and variables */ + parm = buf; + p = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH; + q = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH; + + set_pbc_simd(pbc, &pbc_simd); + + /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */ + for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1) + { + /* Collect atom quadruplets for GMX_SIMD_REAL_WIDTH dihedrals. + * iu indexes into forceatoms; we should not let iu go beyond nbonds. + */ + iu = i; + for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++) + { + type = forceatoms[iu]; + ai[s] = forceatoms[iu+1]; + aj[s] = forceatoms[iu+2]; + ak[s] = forceatoms[iu+3]; + al[s] = forceatoms[iu+4]; + + /* We don't need the first parameter, since that's a constant + * which only affects the energies, not the forces. + */ + for (j = 1; j < NR_RBDIHS; j++) + { + parm[j*GMX_SIMD_REAL_WIDTH + s] = + forceparams[type].rbdihs.rbcA[j]; + } + + /* At the end fill the arrays with identical entries */ + if (iu + nfa1 < nbonds) + { + iu += nfa1; + } + } + + /* Calculate GMX_SIMD_REAL_WIDTH dihedral angles at once */ + dih_angle_simd(x, ai, aj, ak, al, &pbc_simd, + dr, + &phi_S, + &mx_S, &my_S, &mz_S, + &nx_S, &ny_S, &nz_S, + &nrkj_m2_S, + &nrkj_n2_S, + p, q); + + /* Change to polymer convention */ + phi_S = gmx_simd_sub_r(phi_S, pi_S); + + gmx_simd_sincos_r(phi_S, &sin_S, &cos_S); + + ddphi_S = gmx_simd_setzero_r(); + c_S = one_S; + cosfac_S = one_S; + for (j = 1; j < NR_RBDIHS; j++) + { + parm_S = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH); + ddphi_S = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S); + cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S); + c_S = gmx_simd_add_r(c_S, one_S); + } + + /* Note that here we do not use the minus sign which is present + * in the normal RB code. This is corrected for through (m)sf below. 
+ */ + ddphi_S = gmx_simd_mul_r(ddphi_S, sin_S); + + sf_i_S = gmx_simd_mul_r(ddphi_S, nrkj_m2_S); + msf_l_S = gmx_simd_mul_r(ddphi_S, nrkj_n2_S); + + /* After this m?_S will contain f[i] */ + mx_S = gmx_simd_mul_r(sf_i_S, mx_S); + my_S = gmx_simd_mul_r(sf_i_S, my_S); + mz_S = gmx_simd_mul_r(sf_i_S, mz_S); + + /* After this n?_S will contain -f[l] */ + nx_S = gmx_simd_mul_r(msf_l_S, nx_S); + ny_S = gmx_simd_mul_r(msf_l_S, ny_S); + nz_S = gmx_simd_mul_r(msf_l_S, nz_S); + + gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S); + gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S); + gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S); + gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S); + gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S); + gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S); + + iu = i; + s = 0; + do + { + do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s], + p[s], q[s], + dr[ XX *GMX_SIMD_REAL_WIDTH+s], + dr[ YY *GMX_SIMD_REAL_WIDTH+s], + dr[ ZZ *GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s], + dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s], + f); + s++; + iu += nfa1; + } + while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds); + } +} + #endif /* GMX_SIMD_HAVE_REAL */ @@ -4405,6 +4551,19 @@ static real calc_one_bond(int thread, global_atom_index); v = 0; } +#ifdef GMX_SIMD_HAVE_REAL + else if (ftype == F_RBDIHS && + !bCalcEnerVir && fr->efep == efepNO) + { + /* No energies, shift forces, dvdl */ + rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0, + idef->iparams, + (const rvec*)x, f, + pbc, g, lambda[efptFTYPE], md, fcd, + global_atom_index); + v = 0; + } +#endif else { v = interaction_function[ftype].ifunc(nbn, iatoms+nb0, diff --git a/src/gromacs/mdlib/domdec.cpp b/src/gromacs/mdlib/domdec.cpp index 13934d5fa6..06f2602916 100644 --- a/src/gromacs/mdlib/domdec.cpp +++ b/src/gromacs/mdlib/domdec.cpp @@ -275,6 +275,8 @@ typedef struct gmx_domdec_comm /* The DLB option */ int eDLB; + /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */ + gmx_bool bDLB_locked; /* Are we actually using DLB? */ gmx_bool bDynLoadBal; @@ -391,9 +393,9 @@ typedef struct gmx_domdec_comm int eFlop; double flop; int flop_n; - /* Have often have did we have load measurements */ + /* How many times did we have load measurements */ int n_load_have; - /* Have often have we collected the load measurements */ + /* How many times have we collected the load measurements */ int n_load_collect; /* Statistics */ @@ -3465,7 +3467,7 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, cell_size[i] = 1.0/ncd; } } - else if (dd_load_count(comm)) + else if (dd_load_count(comm) > 0) { load_aver = comm->load[d].sum_m/ncd; change_max = 0; @@ -4336,7 +4338,7 @@ static void clear_and_mark_ind(int ncg, int *move, static void print_cg_move(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step, int cg, int dim, int dir, - gmx_bool bHaveLimitdAndCMOld, real limitd, + gmx_bool bHaveCgcmOld, real limitd, rvec cm_old, rvec cm_new, real pos_d) { gmx_domdec_comm_t *comm; char buf[22]; comm = dd->comm; fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf)); - if (bHaveLimitdAndCMOld) + if (limitd > 0) { - fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + dd->comm->bCGs ? 
"The charge group starting at atom" : "Atom", ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim)); } else { - fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n", + /* We don't have a limiting distance available: don't print it */ + fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n", + dd->comm->bCGs ? "The charge group starting at atom" : "Atom", ddglatnr(dd, dd->cgindex[cg]), dim2char(dim)); } fprintf(fplog, "distance out of cell %f\n", dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]); - if (bHaveLimitdAndCMOld) + if (bHaveCgcmOld) { fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n", cm_old[XX], cm_old[YY], cm_old[ZZ]); @@ -4375,19 +4380,20 @@ static void print_cg_move(FILE *fplog, static void cg_move_error(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step, int cg, int dim, int dir, - gmx_bool bHaveLimitdAndCMOld, real limitd, + gmx_bool bHaveCgcmOld, real limitd, rvec cm_old, rvec cm_new, real pos_d) { if (fplog) { print_cg_move(fplog, dd, step, cg, dim, dir, - bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d); + bHaveCgcmOld, limitd, cm_old, cm_new, pos_d); } print_cg_move(stderr, dd, step, cg, dim, dir, - bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d); + bHaveCgcmOld, limitd, cm_old, cm_new, pos_d); gmx_fatal(FARGS, - "A charge group moved too far between two domain decomposition steps\n" - "This usually means that your system is not well equilibrated"); + "%s moved too far between two domain decomposition steps\n" + "This usually means that your system is not well equilibrated", + dd->comm->bCGs ? "A charge group" : "An atom"); } static void rotate_state_atom(t_state *state, int a) @@ -4509,7 +4515,8 @@ static void calc_cg_move(FILE *fplog, gmx_int64_t step, { if (pos_d >= limit1[d]) { - cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d], + cg_move_error(fplog, dd, step, cg, d, 1, + cg_cm != state->x, limitd[d], cg_cm[cg], cm_new, pos_d); } dev[d] = 1; @@ -4535,7 +4542,8 @@ static void calc_cg_move(FILE *fplog, gmx_int64_t step, { if (pos_d < limit0[d]) { - cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d], + cg_move_error(fplog, dd, step, cg, d, -1, + cg_cm != state->x, limitd[d], cg_cm[cg], cm_new, pos_d); } dev[d] = -1; @@ -4949,7 +4957,7 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step, { cg_move_error(fplog, dd, step, cg, dim, (flag & DD_FLAG_FW(d)) ? 1 : 0, - FALSE, 0, + fr->cutoff_scheme == ecutsGROUP, 0, comm->vbuf.v[buf_pos], comm->vbuf.v[buf_pos], comm->vbuf.v[buf_pos][dim]); @@ -6673,7 +6681,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, /* Initialize to GPU share count to 0, might change later */ comm->nrank_gpu_shared = 0; - comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->bDLB_locked = FALSE; comm->bDynLoadBal = (comm->eDLB == edlbYES); if (fplog) @@ -6739,6 +6748,13 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, comm->cellsize_limit = 0; comm->bBondComm = FALSE; + /* Atoms should be able to move by up to half the list buffer size (if > 0) + * within nstlist steps. Since boundaries are allowed to displace by half + * a cell size, DD cells should be at least the size of the list buffer. 
+ */ + comm->cellsize_limit = std::max(comm->cellsize_limit, + ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb)); + if (comm->bInterCGBondeds) { if (comm_distance_min > 0) @@ -7564,6 +7580,20 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr) comm->PMELoadBal_max_cutoff = comm->cutoff; } +gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd) +{ + return dd->comm->bDLB_locked; +} + +void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue) +{ + /* We can only lock the DLB when it is set to auto; otherwise we don't lock */ + if (dd->comm->eDLB == edlbAUTO) + { + dd->comm->bDLB_locked = bValue; + } +} + static void merge_cg_buffers(int ncell, gmx_domdec_comm_dim_t *cd, int pulse, int *ncg_cell, @@ -9324,17 +9354,17 @@ void dd_partition_system(FILE *fplog, } /* Check if we have recorded loads on the nodes */ - if (comm->bRecordLoad && dd_load_count(comm)) + if (comm->bRecordLoad && dd_load_count(comm) > 0) { - if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal) + if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd)) { /* Check if we should use DLB at the second partitioning * and every 100 partitionings, * so the extra communication cost is negligible. */ - n = std::max(100, nstglobalcomm); + const int nddp_chk_dlb = 100; bCheckDLB = (comm->n_load_collect == 0 || - comm->n_load_have % n == n-1); + comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1); } else { @@ -9372,8 +9402,26 @@ void dd_partition_system(FILE *fplog, /* Since the timings are node dependent, the master decides */ if (DDMASTER(dd)) { - bTurnOnDLB = - (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON); + /* Here we check if the max PME rank load is more than 0.98 times + * the max PP force load. If so, PP DLB will not help, + * since we are (almost) limited by PME. Furthermore, + * DLB will cause a significant extra x/f redistribution + * cost on the PME ranks, which will then surely result + * in lower total performance. + * This check might be fragile, since one measurement + * below 0.98 (although only done once every 100 DD part.) + * could turn on DLB for the rest of the run. 
+ */ + if (cr->npmenodes > 0 && + dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON) + { + bTurnOnDLB = FALSE; + } + else + { + bTurnOnDLB = + (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON); + } if (debug) { fprintf(debug, "step %s, imb loss %f\n", diff --git a/src/gromacs/mdlib/force.c b/src/gromacs/mdlib/force.c index f98b073fee..4313d3e75e 100644 --- a/src/gromacs/mdlib/force.c +++ b/src/gromacs/mdlib/force.c @@ -118,9 +118,11 @@ static void reduce_thread_forces(int n, rvec *f, int nthreads, f_thread_t *f_t) { int t, i; + int nthreads_loop gmx_unused; /* This reduction can run over any number of threads */ -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static) + nthreads_loop = gmx_omp_nthreads_get(emntBonded); +#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static) for (i = 0; i < n; i++) { for (t = 1; t < nthreads; t++) diff --git a/src/gromacs/mdlib/mdatom.c b/src/gromacs/mdlib/mdatom.c index 01c5c55bd6..d1722dedf1 100644 --- a/src/gromacs/mdlib/mdatom.c +++ b/src/gromacs/mdlib/mdatom.c @@ -118,6 +118,7 @@ void atoms2md(gmx_mtop_t *mtop, t_inputrec *ir, t_grpopts *opts; gmx_groups_t *groups; gmx_molblock_t *molblock; + int nthreads gmx_unused; bLJPME = EVDW_PME(ir->vdwtype); @@ -230,7 +231,8 @@ void atoms2md(gmx_mtop_t *mtop, t_inputrec *ir, alook = gmx_mtop_atomlookup_init(mtop); -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static) + nthreads = gmx_omp_nthreads_get(emntDefault); +#pragma omp parallel for num_threads(nthreads) schedule(static) for (i = 0; i < md->nr; i++) { int g, ag, molb; diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c index 234bcbdd8f..5255d544d1 100644 --- a/src/gromacs/mdlib/minimize.c +++ b/src/gromacs/mdlib/minimize.c @@ -550,6 +550,7 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, int start, end; rvec *x1, *x2; real dvdl_constr; + int nthreads gmx_unused; s1 = &ems1->s; s2 = &ems2->s; @@ -587,7 +588,8 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, x1 = s1->x; x2 = s2->x; -#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate)) + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) { int gf, i, m; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 799425edab..c024f7b02c 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -212,7 +212,7 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int eeltype, nbnxn_cu_kfunc_ptr_t res; assert(eeltype < eelCuNR); - assert(evdwtype < eelCuNR); + assert(evdwtype < evdwCuNR); if (bDoEne) { diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh index c468306899..3b37869e6d 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh @@ -157,7 +157,7 @@ /* Analytical Ewald interaction kernels with twin-range cut-off */ #define EL_EWALD_ANA -#define LJ_CUTOFF_CHECK +#define VDW_CUTOFF_CHECK /* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__ @@ -189,7 +189,7 @@ #undef NB_KERNEL_FUNC_NAME #undef EL_EWALD_ANA -#undef LJ_CUTOFF_CHECK +#undef VDW_CUTOFF_CHECK /* Tabulated Ewald interaction kernels */ @@ -229,7 +229,7 @@ /* Tabulated Ewald interaction kernels with twin-range cut-off */ #define EL_EWALD_TAB -#define LJ_CUTOFF_CHECK +#define VDW_CUTOFF_CHECK /* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__ @@ -261,4 +261,4 @@ #undef NB_KERNEL_FUNC_NAME #undef EL_EWALD_TAB -#undef LJ_CUTOFF_CHECK +#undef VDW_CUTOFF_CHECK diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre index 9432b2f311..5e410cdecc 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_file_generator/nbnxn_kernel_simd_template.c.pre @@ -139,6 +139,7 @@ void nbnxn_pairlist_t **nbl; int coulkt, vdwkt = 0; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -210,7 +211,8 @@ void gmx_incons("Unsupported VdW interaction type"); }} -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) {{ nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c index 3db03bbec5..4cad2a2f56 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c @@ -185,6 +185,7 @@ nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list, int coult; int vdwt; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -242,7 +243,8 @@ nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list, gmx_incons("Unsupported vdwtype in nbnxn reference kernel"); } -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) { nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c index ed1456873b..1d8567b206 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c @@ -274,6 +274,7 @@ nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t gmx_unused *nbl_list, nbnxn_pairlist_t **nbl; int coulkt, vdwkt = 0; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -345,7 +346,8 @@ nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t gmx_unused *nbl_list, gmx_incons("Unsupported VdW interaction type"); } -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) { nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c index d17ba1e7bd..3531299efd 100644 --- 
a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c +++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c @@ -273,6 +273,7 @@ nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t gmx_unused *nbl_list, nbnxn_pairlist_t **nbl; int coulkt, vdwkt = 0; int nb; + int nthreads gmx_unused; nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; @@ -344,7 +345,8 @@ nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t gmx_unused *nbl_list, gmx_incons("Unsupported VdW interaction type"); } -#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded)) + nthreads = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for schedule(static) num_threads(nthreads) for (nb = 0; nb < nnbl; nb++) { nbnxn_atomdata_output_t *out; diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c index f13de8c7db..56e616967c 100644 --- a/src/gromacs/mdlib/nbnxn_search.c +++ b/src/gromacs/mdlib/nbnxn_search.c @@ -1931,6 +1931,7 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs, float *bbcz; nbnxn_bb_t *bb; int ncd, sc; + int nthreads gmx_unused; grid = &nbs->grid[0]; @@ -1957,7 +1958,8 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs, bbcz = grid->bbcz_simple; bb = grid->bb_simple; -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static) + nthreads = gmx_omp_nthreads_get(emntPairsearch); +#pragma omp parallel for num_threads(nthreads) schedule(static) for (sc = 0; sc < grid->nc; sc++) { int c, tx, na; @@ -4473,6 +4475,7 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl, { int nsci, ncj4, nexcl; int n, i; + int nthreads gmx_unused; if (nblc->bSimple) { @@ -4513,7 +4516,8 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl, /* Each thread should copy its own data to the combined arrays, * as otherwise data will go back and forth between different caches. */ -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static) + nthreads = gmx_omp_nthreads_get(emntPairsearch); +#pragma omp parallel for num_threads(nthreads) schedule(static) for (n = 0; n < nnbl; n++) { int sci_offset; diff --git a/src/gromacs/mdlib/ns.c b/src/gromacs/mdlib/ns.c index 9145929c47..3a1396cd40 100644 --- a/src/gromacs/mdlib/ns.c +++ b/src/gromacs/mdlib/ns.c @@ -427,6 +427,14 @@ static gmx_inline void close_i_nblist(t_nblist *nlist) nlist->jindex[nri+1] = nlist->nrj; len = nlist->nrj - nlist->jindex[nri]; + /* If there are no j-particles we have to reduce the + * number of i-particles again, to prevent errors in the + * kernel functions. + */ + if ((len == 0) && (nlist->nri > 0)) + { + nlist->nri--; + } } } diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c index e6686abd02..b35dd827ac 100644 --- a/src/gromacs/mdlib/sim_util.c +++ b/src/gromacs/mdlib/sim_util.c @@ -1399,6 +1399,16 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, if (bDoForces && DOMAINDECOMP(cr)) { + if (bUseGPU) + { + /* We are done with the CPU compute, but the GPU local non-bonded + * kernel can still be running while we communicate the forces. + * We start a counter here, so we can, hopefully, time the rest + * of the GPU kernel execution and data transfer. 
+ */ + wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST); + } + /* Communicate the forces */ wallcycle_start(wcycle, ewcMOVEF); dd_move_f(cr->dd, f, fr->fshift); @@ -1429,13 +1439,44 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, /* wait for local forces (or calculate in emulation mode) */ if (bUseGPU) { + float cycles_tmp, cycles_wait_est; + const float cuda_api_overhead_margin = 50000.0f; /* cycles */ + wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); nbnxn_cuda_wait_gpu(nbv->cu_nbv, nbv->grp[eintLocal].nbat, flags, eatLocal, enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], fr->fshift); - cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); + cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); + + if (bDoForces && DOMAINDECOMP(cr)) + { + cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST); + + if (cycles_tmp < cuda_api_overhead_margin) + { + /* We measured few cycles, so it could be that the kernel + * and transfer finished earlier and there was no actual + * wait time, only API call overhead. + * Then the actual time could be anywhere between 0 and + * cycles_wait_est. As a compromise, we use half the time. + */ + cycles_wait_est *= 0.5f; + } + } + else + { + /* No force communication so we actually timed the wait */ + cycles_wait_est = cycles_tmp; + } + /* Even though this is after dd_move_f, the actual task we are + * waiting for runs asynchronously with dd_move_f and we usually + * have nothing to balance it with, so we can and should add + * the time to the force time for load balancing. + */ + cycles_force += cycles_wait_est; + cycles_wait_gpu += cycles_wait_est; /* now clear the GPU outputs while we finish the step on the CPU */ diff --git a/src/gromacs/mdlib/update.c b/src/gromacs/mdlib/update.c index 3be84a8a93..373212ca8a 100644 --- a/src/gromacs/mdlib/update.c +++ b/src/gromacs/mdlib/update.c @@ -1767,7 +1767,9 @@ void update_constraints(FILE *fplog, } else { -#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static) + nth = gmx_omp_nthreads_get(emntUpdate); + +#pragma omp parallel for num_threads(nth) schedule(static) for (i = start; i < nrend; i++) { copy_rvec(upd->xp[i], state->x[i]); diff --git a/src/gromacs/timing/wallcycle.c b/src/gromacs/timing/wallcycle.c index 27eb275cd0..3b0d906636 100644 --- a/src/gromacs/timing/wallcycle.c +++ b/src/gromacs/timing/wallcycle.c @@ -99,7 +99,7 @@ static const char *wcn[ewcNR] = "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.", "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec", - "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.", + "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.", "Vsite spread", "COM pull force", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test" @@ -388,6 +388,12 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc) wcc = wc->wcc; + /* The GPU wait estimate counter is used for load balancing only + * and will mess up the total due to double counting: clear it. 
+ */ + wcc[ewcWAIT_GPU_NB_L_EST].n = 0; + wcc[ewcWAIT_GPU_NB_L_EST].c = 0; + for (i = 0; i < ewcNR; i++) { if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME)) diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 104173598a..69c92cde72 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -54,7 +54,7 @@ enum { ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB, ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE, - ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, + ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS, ewcVSITESPREAD, ewcPULLPOT, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD, ewcTEST, ewcNR diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 9864f8789f..0504150c00 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -1881,6 +1881,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], } dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); + if (bPMETuneRunning && + use_GPU(fr->nbv) && DOMAINDECOMP(cr) && + !(cr->duty & DUTY_PME)) + { + /* Lock DLB=auto to off (does nothing when DLB=yes/no). + * With GPUs + separate PME ranks, we don't want DLB. + * DLB could otherwise get turned on while we scan coarse grids + * and would then never be turned off again. + * That would hurt performance at the final, optimal + * grid spacing, where DLB almost never helps. + * Also, DLB can limit the cut-off for PME tuning. + */ + dd_dlb_set_lock(cr->dd, TRUE); + } + if (bPMETuneRunning || step_rel > ir->nstlist*50) { bPMETuneTry = FALSE; @@ -1911,6 +1926,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], { calc_enervirdiff(NULL, ir->eDispCorr, fr); } + + if (!bPMETuneRunning && + DOMAINDECOMP(cr) && + dd_dlb_is_locked(cr->dd)) + { + /* Unlock DLB=auto; DLB is allowed to activate + * (but we don't expect it to activate in most cases). + */ + dd_dlb_set_lock(cr->dd, FALSE); + } } cycles_pmes = 0; } diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c index 5ba76e63e0..dd2cd1a2e9 100644 --- a/src/programs/mdrun/pme_loadbal.c +++ b/src/programs/mdrun/pme_loadbal.c @@ -265,6 +265,15 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb, while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok); set->rcut_coulomb = pme_lb->cut_spacing*sp; + if (set->rcut_coulomb < pme_lb->rcut_coulomb_start) + { + /* This is unlikely, but can happen when e.g. continuing from + * a checkpoint after equilibration where the box shrank a lot. + * We want to avoid rcoulomb getting smaller than rvdw + * and there might be more issues with decreasing rcoulomb. + */ + set->rcut_coulomb = pme_lb->rcut_coulomb_start; + } if (pme_lb->cutoff_scheme == ecutsVERLET) {
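
The calcgrid.c hunk above fixes an off-by-one: with i = g_baseNR - 1, the pre-factor scan must test the largest allowed base size grid_base[i], not grid_base[i-1], or fac2 can stop one doubling short. A self-contained sketch of the corrected search; the base table and nmin values here are hypothetical stand-ins for the real ones in calcgrid.c:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical table of allowed FFT grid sizes (the real one lives in calcgrid.c) */
        const int grid_base[] = { 6, 8, 10, 12, 16, 20, 24, 25, 28, 32 };
        const int g_baseNR    = (int)(sizeof(grid_base)/sizeof(grid_base[0]));
        const int nmin        = 50; /* smallest acceptable grid dimension */
        int       fac2        = 1;
        int       i           = g_baseNR - 1;

        /* Double the pre-factor until the largest base entry reaches nmin;
         * testing grid_base[i-1] here (as the old code did) could stop too early. */
        while (fac2*grid_base[i] < nmin)
        {
            fac2 *= 2;
        }
        printf("fac2 = %d, largest reachable size = %d\n", fac2, fac2*grid_base[i]);
        return 0;
    }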
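
The loop at the heart of the new rbdihs_noener_simd() accumulates the derivative of the Ryckaert-Bellemans potential V(psi) = sum_n C_n cos^n(psi): each pass adds n*C_n*cos^(n-1)(psi), and the sum is then multiplied by sin(psi), with the usual minus sign deferred to the force prefactors. A scalar sketch of that recurrence; the coefficients are made up (the SIMD code loads rbcA from the topology):

    #include <math.h>
    #include <stdio.h>

    #define NR_RBDIHS 6

    /* Mirrors ddphi_S in the SIMD kernel: returns -dV/dpsi before the sign
     * correction; parm[0] only shifts the energy and never enters the force. */
    static double rb_ddphi(const double parm[NR_RBDIHS], double psi)
    {
        double ddphi  = 0.0;
        double c      = 1.0; /* running factor n       */
        double cosfac = 1.0; /* running cos^(n-1)(psi) */
        int    j;

        for (j = 1; j < NR_RBDIHS; j++)
        {
            ddphi  += c*parm[j]*cosfac;
            cosfac *= cos(psi);
            c      += 1.0;
        }
        return ddphi*sin(psi);
    }

    int main(void)
    {
        /* hypothetical RB coefficients in kJ/mol */
        const double parm[NR_RBDIHS] = { 9.28, 12.16, -13.12, -3.06, 26.24, -31.5 };

        printf("ddphi at psi=0.5: %g\n", rb_ddphi(parm, 0.5));
        return 0;
    }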
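
The new lower bound on the DD cell size in init_domain_decomposition() is plain arithmetic: the pair-list buffer is rlistlong minus the longer of the two interaction cut-offs, and cells must not be smaller than that buffer. A tiny numerical sketch with illustrative values, named after the t_inputrec fields used in the hunk:

    #include <stdio.h>

    static double max_d(double a, double b) { return a > b ? a : b; }

    int main(void)
    {
        const double rlistlong = 1.2; /* pair-list cut-off incl. buffer, nm */
        const double rvdw      = 1.0; /* VdW cut-off, nm                    */
        const double rcoulomb  = 1.0; /* Coulomb cut-off, nm                */
        double       cellsize_limit = 0.0;

        /* cells must be at least as large as the list buffer */
        cellsize_limit = max_d(cellsize_limit,
                               rlistlong - max_d(rvdw, rcoulomb));
        printf("cellsize_limit = %g nm\n", cellsize_limit); /* prints 0.2 */
        return 0;
    }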
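
The dd_dlb_is_locked()/dd_dlb_set_lock() pair added in domdec.h/domdec.cpp and driven from md.cpp forms a small protocol: the lock only has an effect under DLB=auto, PME tuning takes it while scanning grids, and it is released when tuning finishes. A minimal self-contained sketch; the struct is a hypothetical reduction of gmx_domdec_t:

    #include <stdbool.h>
    #include <stdio.h>

    enum dlb_mode { DLB_AUTO, DLB_YES, DLB_NO };

    struct domdec {
        enum dlb_mode mode;
        bool          dlb_locked;
    };

    static bool dd_dlb_is_locked(const struct domdec *dd) { return dd->dlb_locked; }

    static void dd_dlb_set_lock(struct domdec *dd, bool value)
    {
        /* locking is only meaningful with DLB=auto; yes/no are never overridden */
        if (dd->mode == DLB_AUTO)
        {
            dd->dlb_locked = value;
        }
    }

    int main(void)
    {
        struct domdec dd = { DLB_AUTO, false };

        dd_dlb_set_lock(&dd, true);  /* PME tuning starts: keep DLB off     */
        printf("locked while tuning: %d\n", dd_dlb_is_locked(&dd));
        dd_dlb_set_lock(&dd, false); /* tuning done: DLB may activate again */
        printf("locked after tuning: %d\n", dd_dlb_is_locked(&dd));
        return 0;
    }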
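
Several hunks above (force.c, mdatom.c, minimize.c, nbnxn_search.c, update.c and the nbnxn kernels) apply the same mechanical change: the gmx_omp_nthreads_get() call is hoisted out of the OpenMP pragma into a variable, apparently because some compilers mishandle a function call inside a num_threads() clause, and the variable carries gmx_unused so builds without OpenMP stay warning-free. A compilable sketch of the pattern; get_nthreads() is a hypothetical stand-in and GMX_UNUSED plays the role of the gmx_unused alias:

    #include <stdio.h>

    #ifdef __GNUC__
    #define GMX_UNUSED __attribute__((unused))
    #else
    #define GMX_UNUSED
    #endif

    static int get_nthreads(void) { return 4; }

    int main(void)
    {
        int sum = 0;
        int i;
        /* without OpenMP the pragma is ignored and nthreads goes unused;
         * the attribute keeps the compiler quiet about that */
        int nthreads GMX_UNUSED;

        nthreads = get_nthreads();
    #pragma omp parallel for num_threads(nthreads) reduction(+:sum) schedule(static)
        for (i = 0; i < 1000; i++)
        {
            sum += i;
        }
        printf("sum = %d\n", sum);
        return 0;
    }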
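
The ns.c hunk keeps the group-scheme kernels from ever seeing an i-entry with an empty j-range: if the entry being closed collected no j-particles, it is dropped again. A reduced sketch; this t_nblist is a hypothetical subset of the real structure:

    #include <stdio.h>

    typedef struct {
        int nri;        /* index of the current (last) i-entry    */
        int nrj;        /* total number of j-entries added so far */
        int jindex[16]; /* j-range boundaries, one per i-entry    */
    } t_nblist;

    static void close_i_nblist(t_nblist *nlist)
    {
        int nri = nlist->nri;
        int len;

        nlist->jindex[nri+1] = nlist->nrj;
        len = nlist->nrj - nlist->jindex[nri];

        /* If there are no j-particles we have to reduce the number of
         * i-particles again, to prevent errors in the kernel functions. */
        if (len == 0 && nlist->nri > 0)
        {
            nlist->nri--;
        }
    }

    int main(void)
    {
        t_nblist nlist = { 1, 3, { 0, 3 } }; /* current i-entry got no j's */

        close_i_nblist(&nlist);
        printf("nri after close: %d\n", nlist.nri); /* prints 0 */
        return 0;
    }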
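
The sim_util.c change estimates how long the local GPU kernel really kept the CPU waiting when the wait overlaps force communication: if the wait counter measured less than a rough CUDA API overhead margin, the true wait is only known to lie in [0, estimate], so half the estimate is used as a compromise. The decision logic, reduced to a sketch under those assumptions (the 50000-cycle margin is the constant from the patch; the function name is illustrative):

    #include <stdio.h>

    static float estimate_gpu_wait_cycles(float wait_counter_cycles,
                                          float comm_plus_wait_cycles,
                                          int   overlapped_with_comm)
    {
        const float api_overhead_margin = 50000.0f; /* cycles, as in the patch */

        if (!overlapped_with_comm)
        {
            /* no force communication, so the wait counter timed the real wait */
            return wait_counter_cycles;
        }
        if (wait_counter_cycles < api_overhead_margin)
        {
            /* likely only API overhead was measured; the true wait lies
             * somewhere in [0, comm_plus_wait_cycles], take the midpoint */
            return 0.5f*comm_plus_wait_cycles;
        }
        return comm_plus_wait_cycles;
    }

    int main(void)
    {
        printf("%g\n", estimate_gpu_wait_cycles(1000.0f, 200000.0f, 1));
        return 0;
    }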