"List of configuration types"
FORCE)
endif()
-set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
+set(build_types_with_explicit_flags RELEASE DEBUG RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
# all tokens are separated by any mix of ',' commas, '=' equal signs
# and whitespace (space, tab)
#
+
+# Teach uncrustify about the GROMACS attribute aliases that we use
+# to hide compiler differences. This means that declarations like
+#
+# int i, j;
+# int nthreads gmx_unused;
+#
+# do not get i aligned with gmx_unused.
+set ATTRIBUTE gmx_unused gmx_inline gmx_restrict
# be set up elsewhere and passed to this function, but it is
# inconvenient in CMake to pass more than one list, and such a
# list is only used here.
- foreach(build_type RELWITHDEBUGINFO RELWITHASSERT MINSIZEREL PROFILE)
+ foreach(build_type RELWITHDEBINFO RELWITHASSERT MINSIZEREL PROFILE)
set(GMXC_${language}FLAGS_${build_type} "${GMXC_${language}FLAGS_RELEASE}")
endforeach()
# Copy the flags that are only used by the real Release build
endif()
# Append to the variables for the given build type for
- # each language, in the parent scope.
+ # each language, in the parent scope. We add our new variables at the end, so
+ # compiler-specific choices are more likely to override default CMake choices.
+ # This is for instance useful for RelWithDebInfo builds, where we want to use the full
+ # set of our optimization flags detected in this file, rather than having -O2 override them.
set(CMAKE_${language}_FLAGS${punctuation}${build_type}
- "${GMXC_${language}FLAGS${punctuation}${build_type}} ${CMAKE_${language}_FLAGS${punctuation}${build_type}}"
+ "${CMAKE_${language}_FLAGS${punctuation}${build_type}} ${GMXC_${language}FLAGS${punctuation}${build_type}}"
PARENT_SCOPE)
endforeach()
endforeach()
message(WARNING "All tested PGI compiler versions (up to 12.9.0) generate binaries which produce incorrect results, or even fail to compile Gromacs. Highly recommended to use a different compiler. If you choose to use PGI, make sure to run the regressiontests.")
endif()
- if(CMAKE_COMPILER_IS_GNUCC AND WIN32 AND (GMX_SIMD STREQUAL "AVX_256" OR GMX_SIMD STREQUAL "AVX2_256"))
- message(WARNING "GCC on Windows with AVX crashes. Choose SSE4_1 or a different compiler.") # GCC bug 49001.
+ if(CMAKE_COMPILER_IS_GNUCC AND
+ (CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9.0" OR CMAKE_SIZEOF_VOID_P EQUAL 8)
+ AND (WIN32 OR CYGWIN)
+ AND GMX_SIMD MATCHES "AVX" AND NOT GMX_SIMD STREQUAL AVX_128_FMA)
+ message(WARNING "GCC on Windows (GCC older than 4.9 or any version when compiling for 64bit) with AVX (other than AVX_128_FMA) crashes. Choose a different GMX_SIMD or a different compiler.") # GCC bug 49001, 54412.
endif()
if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND WIN32 AND NOT CYGWIN)
{
/* To prevent confusion, do not again issue a gmx_fatal here since we already
* get the error message from mdrun itself */
- sprintf(msg, "Cannot run the benchmark simulations! Please check the error message of\n"
+ sprintf(msg,
+ "Cannot run the first benchmark simulation! Please check the error message of\n"
"mdrun for the source of the problem. Did you provide a command line\n"
- "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
+ "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
+ "sure your command line should work, you can bypass this check with \n"
+ "gmx tune_pme -nocheck. The failing command was:\n"
"\n%s\n\n", command);
fprintf(stderr, "%s", msg);
int npme_fixed, /* If >= -1, test fixed number of PME
* nodes only */
const char *npmevalues_opt, /* Which -npme values should be tested */
- t_perf **perfdata, /* Here the performace data is stored */
+ t_perf **perfdata, /* Here the performance data is stored */
int *pmeentries, /* Entries in the nPMEnodes list */
int repeats, /* Repeat each test this often */
int nnodes, /* Total number of nodes = nPP + nPME */
const t_filenm *fnm, /* List of filenames from command line */
int nfile, /* Number of files specified on the cmdl. */
int presteps, /* DLB equilibration steps, is checked */
- gmx_int64_t cpt_steps) /* Time step counter in the checkpoint */
+ gmx_int64_t cpt_steps, /* Time step counter in the checkpoint */
+ gmx_bool bCheck) /* Check whether benchmark mdrun works */
{
int i, nr, k, ret, count = 0, totaltests;
int *nPMEnodes = NULL;
cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
/* To prevent that all benchmarks fail due to a show-stopper argument
- * on the mdrun command line, we make a quick check first */
- if (bFirst)
+ * on the mdrun command line, we make a quick check first.
+ * This check can be turned off in cases where the automatically chosen
+ * number of PME-only ranks leads to a number of PP ranks for which no
+ * decomposition can be found (e.g. for large prime numbers) */
+ if (bFirst && bCheck)
{
make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
}
"need to provide a machine- or hostfile. This can also be passed",
"via the MPIRUN variable, e.g.[PAR]",
"[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
+ "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
+ "check whether mdrun works as expected with the provided parallel settings",
+ "if the [TT]-check[tt] option is activated (the default).",
"Please call [THISMODULE] with the normal options you would pass to",
"[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
"tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
"written with enlarged cutoffs and smaller Fourier grids respectively.",
"Typically, the first test (number 0) will be with the settings from the input",
"[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
- "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
+ "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
"In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
"The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
"spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
"MD systems. The dynamic load balancing needs about 100 time steps",
"to adapt to local load imbalances, therefore the time step counters",
"are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
- "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
+ "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
"From the 'DD' load imbalance entries in the md.log output file you",
"can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
"[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
gmx_bool bKeepAndNumCPT = FALSE;
gmx_bool bResetCountersHalfWay = FALSE;
gmx_bool bBenchmark = TRUE;
+ gmx_bool bCheck = TRUE;
output_env_t oenv = NULL;
"Launch the real simulation after optimization" },
{ "-bench", FALSE, etBOOL, {&bBenchmark},
"Run the benchmarks or just create the input [TT].tpr[tt] files?" },
+ { "-check", FALSE, etBOOL, {&bCheck},
+ "Before the benchmark runs, check whether mdrun works in parallel" },
/******************/
/* mdrun options: */
/******************/
{
do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
- cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
+ cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck);
fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
/* Determine how many pre-factors of 2 we need */
fac2 = 1;
i = g_baseNR - 1;
- while (fac2*grid_base[i-1] < nmin)
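+ /* Double fac2 until fac2 times the last grid_base entry is at least nmin */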
+ while (fac2*grid_base[i] < nmin)
{
fac2 *= 2;
}
{
sprintf(errbuf, "Overriding %s parameters.%s",
interaction_function[ftype].longname,
- (ftype == F_PDIHS) ? "\nUse dihedraltype 4 to allow several multiplicity terms." : "");
+ (ftype == F_PDIHS) ?
+ "\nUse dihedraltype 9 to allow several multiplicity terms. Only consecutive lines are combined. Non-consective lines overwrite each other."
+ : "");
warning(wi, errbuf);
- fprintf(stderr, " old:");
+ fprintf(stderr, " old: ");
for (j = 0; (j < nrfp); j++)
{
fprintf(stderr, " %g", bt->param[i].c[j]);
* possible after subsequently setting a shorter cut-off with change_dd_cutoff.
*/
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd);
+/* Returns whether the DLB lock is set */
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue);
+/* Set or clear a lock such that with DLB=auto DLB cannot get turned on while the lock is set */
+
void dd_setup_dlb_resource_sharing(t_commrec *cr,
const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt);
}
}
+/* This is mostly a copy of pdihs_noener_simd above, but using
+ * the RB potential instead of a harmonic potential.
+ * This function can replace rbdihs() when no energy and virial are needed.
+ */
+static void
+rbdihs_noener_simd(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec f[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
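+ /* Each entry in forceatoms is 1 type index followed by 4 atom indices */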
+ const int nfa1 = 5;
+ int i, iu, s, j;
+ int type, ai[GMX_SIMD_REAL_WIDTH], aj[GMX_SIMD_REAL_WIDTH], ak[GMX_SIMD_REAL_WIDTH], al[GMX_SIMD_REAL_WIDTH];
+ real dr_array[3*DIM*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *dr;
+ real buf_array[(NR_RBDIHS + 4)*GMX_SIMD_REAL_WIDTH+GMX_SIMD_REAL_WIDTH], *buf;
+ real *parm, *p, *q;
+
+ gmx_simd_real_t phi_S;
+ gmx_simd_real_t ddphi_S, cosfac_S;
+ gmx_simd_real_t mx_S, my_S, mz_S;
+ gmx_simd_real_t nx_S, ny_S, nz_S;
+ gmx_simd_real_t nrkj_m2_S, nrkj_n2_S;
+ gmx_simd_real_t parm_S, c_S;
+ gmx_simd_real_t sin_S, cos_S;
+ gmx_simd_real_t sf_i_S, msf_l_S;
+ pbc_simd_t pbc_simd;
+
+ gmx_simd_real_t pi_S = gmx_simd_set1_r(M_PI);
+ gmx_simd_real_t one_S = gmx_simd_set1_r(1.0);
+
+ /* Ensure SIMD register alignment */
+ dr = gmx_simd_align_r(dr_array);
+ buf = gmx_simd_align_r(buf_array);
+
+ /* Extract aligned pointer for parameters and variables */
+ parm = buf;
+ p = buf + (NR_RBDIHS + 0)*GMX_SIMD_REAL_WIDTH;
+ q = buf + (NR_RBDIHS + 1)*GMX_SIMD_REAL_WIDTH;
+
+ set_pbc_simd(pbc, &pbc_simd);
+
+ /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+ for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ {
+ /* Collect atom quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+ * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+ al[s] = forceatoms[iu+4];
+
+ /* We don't need the first parameter, since that's a constant
+ * which only affects the energies, not the forces.
+ */
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm[j*GMX_SIMD_REAL_WIDTH + s] =
+ forceparams[type].rbdihs.rbcA[j];
+ }
+
+ /* At the end fill the arrays with identical entries */
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+
+ /* Calculate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+ dih_angle_simd(x, ai, aj, ak, al, &pbc_simd,
+ dr,
+ &phi_S,
+ &mx_S, &my_S, &mz_S,
+ &nx_S, &ny_S, &nz_S,
+ &nrkj_m2_S,
+ &nrkj_n2_S,
+ p, q);
+
+ /* Change to polymer convention */
+ phi_S = gmx_simd_sub_r(phi_S, pi_S);
+
+ gmx_simd_sincos_r(phi_S, &sin_S, &cos_S);
+
+ ddphi_S = gmx_simd_setzero_r();
+ c_S = one_S;
+ cosfac_S = one_S;
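+ /* Accumulate dV/dcos = sum_j j*C_j*cos^(j-1) of the RB potential for j = 1..NR_RBDIHS-1 */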
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm_S = gmx_simd_load_r(parm + j*GMX_SIMD_REAL_WIDTH);
+ ddphi_S = gmx_simd_fmadd_r(gmx_simd_mul_r(c_S, parm_S), cosfac_S, ddphi_S);
+ cosfac_S = gmx_simd_mul_r(cosfac_S, cos_S);
+ c_S = gmx_simd_add_r(c_S, one_S);
+ }
+
+ /* Note that here we do not use the minus sign which is present
+ * in the normal RB code. This is corrected for through (m)sf below.
+ */
+ ddphi_S = gmx_simd_mul_r(ddphi_S, sin_S);
+
+ sf_i_S = gmx_simd_mul_r(ddphi_S, nrkj_m2_S);
+ msf_l_S = gmx_simd_mul_r(ddphi_S, nrkj_n2_S);
+
+ /* After this m?_S will contain f[i] */
+ mx_S = gmx_simd_mul_r(sf_i_S, mx_S);
+ my_S = gmx_simd_mul_r(sf_i_S, my_S);
+ mz_S = gmx_simd_mul_r(sf_i_S, mz_S);
+
+ /* After this n?_S will contain -f[l] */
+ nx_S = gmx_simd_mul_r(msf_l_S, nx_S);
+ ny_S = gmx_simd_mul_r(msf_l_S, ny_S);
+ nz_S = gmx_simd_mul_r(msf_l_S, nz_S);
+
+ gmx_simd_store_r(dr + 0*GMX_SIMD_REAL_WIDTH, mx_S);
+ gmx_simd_store_r(dr + 1*GMX_SIMD_REAL_WIDTH, my_S);
+ gmx_simd_store_r(dr + 2*GMX_SIMD_REAL_WIDTH, mz_S);
+ gmx_simd_store_r(dr + 3*GMX_SIMD_REAL_WIDTH, nx_S);
+ gmx_simd_store_r(dr + 4*GMX_SIMD_REAL_WIDTH, ny_S);
+ gmx_simd_store_r(dr + 5*GMX_SIMD_REAL_WIDTH, nz_S);
+
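+ /* Distribute the forces of the GMX_SIMD_REAL_WIDTH dihedrals over their four atoms */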
+ iu = i;
+ s = 0;
+ do
+ {
+ do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s],
+ p[s], q[s],
+ dr[ XX *GMX_SIMD_REAL_WIDTH+s],
+ dr[ YY *GMX_SIMD_REAL_WIDTH+s],
+ dr[ ZZ *GMX_SIMD_REAL_WIDTH+s],
+ dr[(DIM+XX)*GMX_SIMD_REAL_WIDTH+s],
+ dr[(DIM+YY)*GMX_SIMD_REAL_WIDTH+s],
+ dr[(DIM+ZZ)*GMX_SIMD_REAL_WIDTH+s],
+ f);
+ s++;
+ iu += nfa1;
+ }
+ while (s < GMX_SIMD_REAL_WIDTH && iu < nbonds);
+ }
+}
+
#endif /* GMX_SIMD_HAVE_REAL */
global_atom_index);
v = 0;
}
+#ifdef GMX_SIMD_HAVE_REAL
+ else if (ftype == F_RBDIHS &&
+ !bCalcEnerVir && fr->efep == efepNO)
+ {
+ /* No energies, shift forces, dvdl */
+ rbdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+ idef->iparams,
+ (const rvec*)x, f,
+ pbc, g, lambda[efptFTYPE], md, fcd,
+ global_atom_index);
+ v = 0;
+ }
+#endif
else
{
v = interaction_function[ftype].ifunc(nbn, iatoms+nb0,
/* The DLB option */
int eDLB;
+ /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+ gmx_bool bDLB_locked;
/* Are we actually using DLB? */
gmx_bool bDynLoadBal;
int eFlop;
double flop;
int flop_n;
- /* Have often have did we have load measurements */
+ /* How many times have we had load measurements */
int n_load_have;
- /* Have often have we collected the load measurements */
+ /* How many times have we collected the load measurements */
int n_load_collect;
/* Statistics */
cell_size[i] = 1.0/ncd;
}
}
- else if (dd_load_count(comm))
+ else if (dd_load_count(comm) > 0)
{
load_aver = comm->load[d].sum_m/ncd;
change_max = 0;
static void print_cg_move(FILE *fplog,
gmx_domdec_t *dd,
gmx_int64_t step, int cg, int dim, int dir,
- gmx_bool bHaveLimitdAndCMOld, real limitd,
+ gmx_bool bHaveCgcmOld, real limitd,
rvec cm_old, rvec cm_new, real pos_d)
{
gmx_domdec_comm_t *comm;
comm = dd->comm;
fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
- if (bHaveLimitdAndCMOld)
+ if (limitd > 0)
{
- fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+ fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+ dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
}
else
{
- fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
+ /* We don't have a limiting distance available: don't print it */
+ fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+ dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
}
fprintf(fplog, "distance out of cell %f\n",
dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
- if (bHaveLimitdAndCMOld)
+ if (bHaveCgcmOld)
{
fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
cm_old[XX], cm_old[YY], cm_old[ZZ]);
static void cg_move_error(FILE *fplog,
gmx_domdec_t *dd,
gmx_int64_t step, int cg, int dim, int dir,
- gmx_bool bHaveLimitdAndCMOld, real limitd,
+ gmx_bool bHaveCgcmOld, real limitd,
rvec cm_old, rvec cm_new, real pos_d)
{
if (fplog)
{
print_cg_move(fplog, dd, step, cg, dim, dir,
- bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+ bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
}
print_cg_move(stderr, dd, step, cg, dim, dir,
- bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+ bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
gmx_fatal(FARGS,
- "A charge group moved too far between two domain decomposition steps\n"
- "This usually means that your system is not well equilibrated");
+ "%s moved too far between two domain decomposition steps\n"
+ "This usually means that your system is not well equilibrated",
+ dd->comm->bCGs ? "A charge group" : "An atom");
}
static void rotate_state_atom(t_state *state, int a)
{
if (pos_d >= limit1[d])
{
- cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+ cg_move_error(fplog, dd, step, cg, d, 1,
+ cg_cm != state->x, limitd[d],
cg_cm[cg], cm_new, pos_d);
}
dev[d] = 1;
{
if (pos_d < limit0[d])
{
- cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+ cg_move_error(fplog, dd, step, cg, d, -1,
+ cg_cm != state->x, limitd[d],
cg_cm[cg], cm_new, pos_d);
}
dev[d] = -1;
{
cg_move_error(fplog, dd, step, cg, dim,
(flag & DD_FLAG_FW(d)) ? 1 : 0,
- FALSE, 0,
+ fr->cutoff_scheme == ecutsGROUP, 0,
comm->vbuf.v[buf_pos],
comm->vbuf.v[buf_pos],
comm->vbuf.v[buf_pos][dim]);
/* Initialize the GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
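+ /* The DLB lock is only used with DLB=auto and can be set during PME load balancing */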
+ comm->bDLB_locked = FALSE;
comm->bDynLoadBal = (comm->eDLB == edlbYES);
if (fplog)
comm->cellsize_limit = 0;
comm->bBondComm = FALSE;
+ /* Atoms should be able to move by up to half the list buffer size (if > 0)
+ * within nstlist steps. Since boundaries are allowed to displace by half
+ * a cell size, DD cells should be at least the size of the list buffer.
+ */
+ comm->cellsize_limit = std::max(comm->cellsize_limit,
+ ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
+
if (comm->bInterCGBondeds)
{
if (comm_distance_min > 0)
comm->PMELoadBal_max_cutoff = comm->cutoff;
}
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+{
+ return dd->comm->bDLB_locked;
+}
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+{
+ /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+ if (dd->comm->eDLB == edlbAUTO)
+ {
+ dd->comm->bDLB_locked = bValue;
+ }
+}
+
static void merge_cg_buffers(int ncell,
gmx_domdec_comm_dim_t *cd, int pulse,
int *ncg_cell,
}
/* Check if we have recorded loads on the nodes */
- if (comm->bRecordLoad && dd_load_count(comm))
+ if (comm->bRecordLoad && dd_load_count(comm) > 0)
{
- if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+ if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
{
/* Check if we should use DLB at the second partitioning
* and every 100 partitionings,
* so the extra communication cost is negligible.
*/
- n = std::max(100, nstglobalcomm);
+ const int nddp_chk_dlb = 100;
bCheckDLB = (comm->n_load_collect == 0 ||
- comm->n_load_have % n == n-1);
+ comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
}
else
{
/* Since the timings are node dependent, the master decides */
if (DDMASTER(dd))
{
- bTurnOnDLB =
- (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ /* Here we check if the max PME rank load is more than 0.98
+ * times the max PP force load. If so, PP DLB will not help,
+ * since we are (almost) limited by PME. Furthermore,
+ * DLB will cause a significant extra x/f redistribution
+ * cost on the PME ranks, which will then surely result
+ * in lower total performance.
+ * This check might be fragile, since one measurement
+ * below 0.98 (although only done once every 100 DD part.)
+ * could turn on DLB for the rest of the run.
+ */
+ if (cr->npmenodes > 0 &&
+ dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ {
+ bTurnOnDLB = FALSE;
+ }
+ else
+ {
+ bTurnOnDLB =
+ (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ }
if (debug)
{
fprintf(debug, "step %s, imb loss %f\n",
int nthreads, f_thread_t *f_t)
{
int t, i;
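+ /* gmx_unused because the variable is only read in the OpenMP pragma below */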
+ int nthreads_loop gmx_unused;
/* This reduction can run over any number of threads */
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
+ nthreads_loop = gmx_omp_nthreads_get(emntBonded);
+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
for (i = 0; i < n; i++)
{
for (t = 1; t < nthreads; t++)
t_grpopts *opts;
gmx_groups_t *groups;
gmx_molblock_t *molblock;
+ int nthreads gmx_unused;
bLJPME = EVDW_PME(ir->vdwtype);
alook = gmx_mtop_atomlookup_init(mtop);
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static)
+ nthreads = gmx_omp_nthreads_get(emntDefault);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (i = 0; i < md->nr; i++)
{
int g, ag, molb;
int start, end;
rvec *x1, *x2;
real dvdl_constr;
+ int nthreads gmx_unused;
s1 = &ems1->s;
s2 = &ems2->s;
x1 = s1->x;
x2 = s2->x;
-#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+ nthreads = gmx_omp_nthreads_get(emntUpdate);
+#pragma omp parallel num_threads(nthreads)
{
int gf, i, m;
nbnxn_cu_kfunc_ptr_t res;
assert(eeltype < eelCuNR);
- assert(evdwtype < eelCuNR);
+ assert(evdwtype < evdwCuNR);
if (bDoEne)
{
/* Analytical Ewald interaction kernels with twin-range cut-off
*/
#define EL_EWALD_ANA
-#define LJ_CUTOFF_CHECK
+#define VDW_CUTOFF_CHECK
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
#undef NB_KERNEL_FUNC_NAME
#undef EL_EWALD_ANA
-#undef LJ_CUTOFF_CHECK
+#undef VDW_CUTOFF_CHECK
/* Tabulated Ewald interaction kernels */
/* Tabulated Ewald interaction kernels with twin-range cut-off */
#define EL_EWALD_TAB
-#define LJ_CUTOFF_CHECK
+#define VDW_CUTOFF_CHECK
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
#undef NB_KERNEL_FUNC_NAME
#undef EL_EWALD_TAB
-#undef LJ_CUTOFF_CHECK
+#undef VDW_CUTOFF_CHECK
nbnxn_pairlist_t **nbl;
int coulkt, vdwkt = 0;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported VdW interaction type");
}}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{{
nbnxn_atomdata_output_t *out;
int coult;
int vdwt;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported vdwtype in nbnxn reference kernel");
}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{
nbnxn_atomdata_output_t *out;
nbnxn_pairlist_t **nbl;
int coulkt, vdwkt = 0;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported VdW interaction type");
}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{
nbnxn_atomdata_output_t *out;
nbnxn_pairlist_t **nbl;
int coulkt, vdwkt = 0;
int nb;
+ int nthreads gmx_unused;
nnbl = nbl_list->nnbl;
nbl = nbl_list->nbl;
gmx_incons("Unsupported VdW interaction type");
}
-#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+ nthreads = gmx_omp_nthreads_get(emntNonbonded);
+#pragma omp parallel for schedule(static) num_threads(nthreads)
for (nb = 0; nb < nnbl; nb++)
{
nbnxn_atomdata_output_t *out;
float *bbcz;
nbnxn_bb_t *bb;
int ncd, sc;
+ int nthreads gmx_unused;
grid = &nbs->grid[0];
bbcz = grid->bbcz_simple;
bb = grid->bb_simple;
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ nthreads = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (sc = 0; sc < grid->nc; sc++)
{
int c, tx, na;
{
int nsci, ncj4, nexcl;
int n, i;
+ int nthreads gmx_unused;
if (nblc->bSimple)
{
/* Each thread should copy its own data to the combined arrays,
* as otherwise data will go back and forth between different caches.
*/
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+ nthreads = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (n = 0; n < nnbl; n++)
{
int sci_offset;
nlist->jindex[nri+1] = nlist->nrj;
len = nlist->nrj - nlist->jindex[nri];
+ /* If there are no j-particles we have to reduce the
+ * number of i-particles again, to prevent errors in the
+ * kernel functions.
+ */
+ if ((len == 0) && (nlist->nri > 0))
+ {
+ nlist->nri--;
+ }
}
}
if (bDoForces && DOMAINDECOMP(cr))
{
+ if (bUseGPU)
+ {
+ /* We are done with the CPU compute, but the GPU local non-bonded
+ * kernel can still be running while we communicate the forces.
+ * We start a counter here, so we can, hopefully, time the rest
+ * of the GPU kernel execution and data transfer.
+ */
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+ }
+
/* Communicate the forces */
wallcycle_start(wcycle, ewcMOVEF);
dd_move_f(cr->dd, f, fr->fshift);
/* wait for local forces (or calculate in emulation mode) */
if (bUseGPU)
{
+ float cycles_tmp, cycles_wait_est;
+ const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
nbnxn_cuda_wait_gpu(nbv->cu_nbv,
nbv->grp[eintLocal].nbat,
flags, eatLocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ if (bDoForces && DOMAINDECOMP(cr))
+ {
+ cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+
+ if (cycles_tmp < cuda_api_overhead_margin)
+ {
+ /* We measured few cycles, it could be that the kernel
+ * and transfer finished earlier and there was no actual
+ * wait time, only API call overhead.
+ * Then the actual time could be anywhere between 0 and
+ * cycles_wait_est. As a compromise, we use half the time.
+ */
+ cycles_wait_est *= 0.5f;
+ }
+ }
+ else
+ {
+ /* No force communication so we actually timed the wait */
+ cycles_wait_est = cycles_tmp;
+ }
+ /* Even though this is after dd_move_f, the actual task we are
+ * waiting for runs asynchronously with dd_move_f and we usually
+ * have nothing to balance it with, so we can and should add
+ * the time to the force time for load balancing.
+ */
+ cycles_force += cycles_wait_est;
+ cycles_wait_gpu += cycles_wait_est;
/* now clear the GPU outputs while we finish the step on the CPU */
}
else
{
-#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
+ nth = gmx_omp_nthreads_get(emntUpdate);
+
+#pragma omp parallel for num_threads(nth) schedule(static)
for (i = start; i < nrend; i++)
{
copy_rvec(upd->xp[i], state->x[i]);
"DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
- "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+ "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
"Vsite spread", "COM pull force",
"Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
wcc = wc->wcc;
+ /* The GPU wait estimate counter is used for load balancing only
+ * and will mess up the total due to double counting: clear it.
+ */
+ wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
+ wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
+
for (i = 0; i < ewcNR; i++)
{
if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
- ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
+ ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS,
ewcVSITESPREAD, ewcPULLPOT,
ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
ewcTEST, ewcNR
}
dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+ if (bPMETuneRunning &&
+ use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+ !(cr->duty & DUTY_PME))
+ {
+ /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+ * With GPUs + separate PME ranks, we don't want DLB.
+ * DLB could get turned on while we scan coarse grids and
+ * would then never be turned off again.
+ * This would hurt performance at the final, optimal
+ * grid spacing, where DLB almost never helps.
+ * Also, DLB can limit the cut-off for PME tuning.
+ */
+ dd_dlb_set_lock(cr->dd, TRUE);
+ }
+
if (bPMETuneRunning || step_rel > ir->nstlist*50)
{
bPMETuneTry = FALSE;
{
calc_enervirdiff(NULL, ir->eDispCorr, fr);
}
+
+ if (!bPMETuneRunning &&
+ DOMAINDECOMP(cr) &&
+ dd_dlb_is_locked(cr->dd))
+ {
+ /* Unlock DLB=auto; DLB is now allowed to activate
+ * (but we don't expect it to activate in most cases).
+ */
+ dd_dlb_set_lock(cr->dd, FALSE);
+ }
}
cycles_pmes = 0;
}
while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
set->rcut_coulomb = pme_lb->cut_spacing*sp;
+ if (set->rcut_coulomb < pme_lb->rcut_coulomb_start)
+ {
+ /* This is unlikely, but can happen when e.g. continuing from
+ * a checkpoint after equilibration where the box shrank a lot.
+ * We want to avoid rcoulomb getting smaller than rvdw
+ * and there might be more issues with decreasing rcoulomb.
+ */
+ set->rcut_coulomb = pme_lb->rcut_coulomb_start;
+ }
if (pme_lb->cutoff_scheme == ecutsVERLET)
{