#include "config.h"
-#include <assert.h>
-#include <math.h>
#include <stdio.h>
#include <string.h>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
#include <array>
+#include "gromacs/awh/awh.h"
+#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
-#include "gromacs/gmxlib/md_logging.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
-#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
-#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"
#include "nbnxn_gpu.h"
+#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
+#include "nbnxn_kernels/nbnxn_kernel_prune.h"
+
+// TODO: this environment variable allows us to verify before release
+// that on less common architectures the total cost of polling is not larger than
+// that of a blocking wait (so polling does not introduce overhead when the static
+// PME-first ordering would suffice).
+static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
walltime_accounting_get_start_time_stamp(walltime_accounting));
}
-static void sum_forces(int start, int end, rvec f[], rvec flr[])
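+/*! \brief Adds forceToAdd element-wise to f over the range of forceToAdd, parallelized over OpenMP threads */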
+static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
- int i;
+ const int end = forceToAdd.size();
- if (gmx_debug_at)
- {
- pr_rvecs(debug, 0, "fsr", f+start, end-start);
- pr_rvecs(debug, 0, "flr", flr+start, end-start);
- }
// cppcheck-suppress unreadVariable
int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
- for (i = start; i < end; i++)
+ for (int i = 0; i < end; i++)
{
- rvec_inc(f[i], flr[i]);
+ rvec_inc(f[i], forceToAdd[i]);
}
}
-/*
- * calc_f_el calculates forces due to an electric field.
- *
- * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
- *
- * Et[] contains the parameters for the time dependent
- * part of the field.
- * Ex[] contains the parameters for
- * the spatial dependent part of the field.
- * The function should return the energy due to the electric field
- * (if any) but for now returns 0.
- *
- * WARNING:
- * There can be problems with the virial.
- * Since the field is not self-consistent this is unavoidable.
- * For neutral molecules the virial is correct within this approximation.
- * For neutral systems with many charged molecules the error is small.
- * But for systems with a net charge or a few charged molecules
- * the error can be significant when the field is high.
- * Solution: implement a self-consistent electric field into PME.
- */
-static void calc_f_el(FILE *fp, int start, int homenr,
- real charge[], rvec f[],
- t_cosines Ex[], t_cosines Et[], double t)
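+/*! \brief Accumulates the PME GPU outputs: forces and virial into forceWithVirial, the reciprocal-space Coulomb energy into enerd */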
+static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
+ ForceWithVirial *forceWithVirial,
+                                   gmx::ArrayRef<const gmx::RVec> pmeForces,
+ gmx_enerdata_t *enerd,
+ const tensor vir_Q,
+ real Vlr_q)
{
- rvec Ext;
- real t0;
- int i, m;
-
- for (m = 0; (m < DIM); m++)
- {
- if (Et[m].n > 0)
- {
- if (Et[m].n == 3)
- {
- t0 = Et[m].a[1];
- Ext[m] = std::cos(Et[m].a[0]*(t-t0))*std::exp(-gmx::square(t-t0)/(2.0*gmx::square(Et[m].a[2])));
- }
- else
- {
- Ext[m] = std::cos(Et[m].a[0]*t);
- }
- }
- else
- {
- Ext[m] = 1.0;
- }
- if (Ex[m].n > 0)
- {
- /* Convert the field strength from V/nm to MD-units */
- Ext[m] *= Ex[m].a[0]*FIELDFAC;
- for (i = start; (i < start+homenr); i++)
- {
- f[i][m] += charge[i]*Ext[m];
- }
- }
- else
- {
- Ext[m] = 0;
- }
- }
- if (fp != NULL)
- {
- fprintf(fp, "%10g %10g %10g %10g #FIELD\n", t,
- Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
- }
+ wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
+ GMX_ASSERT(forceWithVirial, "Invalid force pointer");
+ forceWithVirial->addVirialContribution(vir_Q);
+ enerd->term[F_COUL_RECIP] += Vlr_q;
+ sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
+ wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
}
static void calc_virial(int start, int homenr, rvec x[], rvec f[],
int i;
/* The short-range virial from surrounding boxes */
- clear_mat(vir_part);
calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
inc_nrnb(nrnb, eNR_VIRIAL, homenr);
- /* Add position restraint contribution */
- for (i = 0; i < DIM; i++)
- {
- vir_part[i][i] += fr->vir_diag_posres[i];
- }
-
/* Add wall contribution */
for (i = 0; i < DIM; i++)
{
static void pull_potential_wrapper(t_commrec *cr,
t_inputrec *ir,
matrix box, rvec x[],
- rvec f[],
- tensor vir_force,
+ ForceWithVirial *force,
t_mdatoms *mdatoms,
gmx_enerdata_t *enerd,
real *lambda,
-    /* Calculate the center of mass forces, this requires communication,
+    /* Calculate the center of mass forces; this requires communication,
* which is why pull_potential is called close to other communication.
- * The virial contribution is calculated directly,
- * which is why we call pull_potential after calc_virial.
*/
wallcycle_start(wcycle, ewcPULLPOT);
set_pbc(&pbc, ir->ePBC, box);
dvdl = 0;
enerd->term[F_COM_PULL] +=
pull_potential(ir->pull_work, mdatoms, &pbc,
- cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
+ cr, t, lambda[efptRESTRAINT], x, force, &dvdl);
enerd->dvdl_lin[efptRESTRAINT] += dvdl;
wallcycle_stop(wcycle, ewcPULLPOT);
}
-static void pme_receive_force_ener(t_commrec *cr,
- gmx_wallcycle_t wcycle,
- gmx_enerdata_t *enerd,
- t_forcerec *fr)
+static void pme_receive_force_ener(t_commrec *cr,
+ ForceWithVirial *forceWithVirial,
+ gmx_enerdata_t *enerd,
+ gmx_wallcycle_t wcycle)
{
real e_q, e_lj, dvdl_q, dvdl_lj;
float cycles_ppdpme, cycles_seppme;
wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
dvdl_q = 0;
dvdl_lj = 0;
- gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
- fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
+ gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
&cycles_seppme);
enerd->term[F_COUL_RECIP] += e_q;
enerd->term[F_LJ_RECIP] += e_lj;
}
static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
- gmx_int64_t step, real pforce, rvec *x, rvec *f)
+ gmx_int64_t step, real forceTolerance,
+ const rvec *x, const rvec *f)
{
- int i;
- real pf2, fn2;
- char buf[STEPSTRSIZE];
-
- pf2 = gmx::square(pforce);
- for (i = 0; i < md->homenr; i++)
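+    /* Compare squared force norms against the squared tolerance to avoid a sqrt per atom */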
+ real force2Tolerance = gmx::square(forceTolerance);
+ std::uintmax_t numNonFinite = 0;
+ for (int i = 0; i < md->homenr; i++)
{
- fn2 = norm2(f[i]);
- /* We also catch NAN, if the compiler does not optimize this away. */
- if (fn2 >= pf2 || fn2 != fn2)
+ real force2 = norm2(f[i]);
+ bool nonFinite = !std::isfinite(force2);
+ if (force2 >= force2Tolerance || nonFinite)
+ {
+ fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ step,
+ ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
+ }
+ if (nonFinite)
{
- fprintf(fp, "step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
- gmx_step_str(step, buf),
- ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(fn2));
+ numNonFinite++;
}
}
+ if (numNonFinite > 0)
+ {
+ /* Note that with MPI this fatal call on one rank might interrupt
+ * the printing on other ranks. But we can only avoid that with
+ * an expensive MPI barrier that we would need at each step.
+ */
+ gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
+ }
}
static void post_process_forces(t_commrec *cr,
gmx_localtop_t *top,
matrix box, rvec x[],
rvec f[],
+ ForceWithVirial *forceWithVirial,
tensor vir_force,
t_mdatoms *mdatoms,
t_graph *graph,
t_forcerec *fr, gmx_vsite_t *vsite,
int flags)
{
- if (fr->bF_NoVirSum)
+ if (fr->haveDirectVirialContributions)
{
+ rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
+
if (vsite)
{
/* Spread the mesh force on virtual sites to the other particles...
* if the constructing atoms aren't local.
*/
wallcycle_start(wcycle, ewcVSITESPREAD);
- spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
- (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
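+            /* Collect the vsite-spreading virial in a local buffer, then add it to the direct virial contributions */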
+ matrix virial = { { 0 } };
+ spread_vsite_f(vsite, x, fDirectVir, nullptr,
+ (flags & GMX_FORCE_VIRIAL), virial,
nrnb,
&top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ forceWithVirial->addVirialContribution(virial);
wallcycle_stop(wcycle, ewcVSITESPREAD);
}
+
if (flags & GMX_FORCE_VIRIAL)
{
/* Now add the forces, this is local */
- if (fr->bDomDec)
- {
- sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
- }
- else
- {
- sum_forces(0, mdatoms->homenr,
- f, fr->f_novirsum);
- }
- if (EEL_FULL(fr->eeltype))
- {
- /* Add the mesh contribution to the virial */
- m_add(vir_force, fr->vir_el_recip, vir_force);
- }
- if (EVDW_PME(fr->vdwtype))
- {
- /* Add the mesh contribution to the virial */
- m_add(vir_force, fr->vir_lj_recip, vir_force);
- }
+ sum_forces(f, forceWithVirial->force_);
+
+ /* Add the direct virial contributions */
+ GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
+ m_add(vir_force, forceWithVirial->getVirial(), vir_force);
+
if (debug)
{
pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
gmx_enerdata_t *enerd,
int flags, int ilocality,
int clearF,
+ gmx_int64_t step,
t_nrnb *nrnb,
gmx_wallcycle_t wcycle)
{
- int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
- nonbonded_verlet_group_t *nbvg;
- gmx_bool bUsingGpuKernels;
-
if (!(flags & GMX_FORCE_NONBONDED))
{
/* skip non-bonded calculation */
return;
}
- nbvg = &fr->nbv->grp[ilocality];
+ nonbonded_verlet_t *nbv = fr->nbv;
+ nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
/* GPU kernel launch overhead is already timed separately */
if (fr->cutoff_scheme != ecutsVERLET)
gmx_incons("Invalid cut-off scheme passed!");
}
- bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
+ bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
if (!bUsingGpuKernels)
{
+ /* When dynamic pair-list pruning is requested, we need to prune
+ * at nstlistPrune steps.
+ */
+ if (nbv->listParams->useDynamicPruning &&
+ (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
+ {
+ /* Prune the pair-list beyond fr->ic->rlistPrune using
+ * the current coordinates of the atoms.
+ */
+ wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+ nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
+ wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+ }
+
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
+
switch (nbvg->kernel_type)
{
case nbnxnk4x4_PlainC:
- nbnxn_kernel_ref(&nbvg->nbl_lists,
- nbvg->nbat, ic,
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+ nbnxn_kernel_cpu(nbvg,
+ nbv->nbat,
+ ic,
fr->shift_vec,
flags,
clearF,
enerd->grpp.ener[egLJSR]);
break;
- case nbnxnk4xN_SIMD_4xN:
- nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
- nbvg->nbat, ic,
- nbvg->ewald_excl,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
- break;
- case nbnxnk4xN_SIMD_2xNN:
- nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
- nbvg->nbat, ic,
- nbvg->ewald_excl,
- fr->shift_vec,
- flags,
- clearF,
- fr->fshift[0],
- enerd->grpp.ener[egCOULSR],
- fr->bBHAM ?
- enerd->grpp.ener[egBHAMSR] :
- enerd->grpp.ener[egLJSR]);
- break;
-
case nbnxnk8x8x8_GPU:
- nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
+ nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
break;
case nbnxnk8x8x8_PlainC:
nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
- nbvg->nbat, ic,
+ nbv->nbat, ic,
fr->shift_vec,
flags,
clearF,
- nbvg->nbat->out[0].f,
+ nbv->nbat->out[0].f,
fr->fshift[0],
enerd->grpp.ener[egCOULSR],
fr->bBHAM ?
break;
default:
- gmx_incons("Invalid nonbonded kernel type passed!");
+ GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
}
if (!bUsingGpuKernels)
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
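+    /* Select the nrnb flop-accounting counters that match the kernel flavor used this step */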
+ int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
}
else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
- (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
+ (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
}
dvdl_nb[i] = 0;
}
- assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
+    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "The number of nonbonded pair lists should match the number of nonbonded threads");
wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
- return nbv != NULL && nbv->bUseGPU;
+ return nbv != nullptr && nbv->bUseGPU;
}
static gmx_inline void clear_rvecs_omp(int n, rvec v[])
}
}
-/*! \brief This routine checks if the potential energy is finite.
+/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
+ *
+ * \param groupOptions Group options, containing T-coupling options
+ */
+static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
+{
+ real nrdfCoupled = 0;
+ real nrdfUncoupled = 0;
+ real kineticEnergy = 0;
+ for (int g = 0; g < groupOptions.ngtc; g++)
+ {
+ if (groupOptions.tau_t[g] >= 0)
+ {
+ nrdfCoupled += groupOptions.nrdf[g];
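+            /* By equipartition, each coupled degree of freedom contributes kB*T/2 */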
+ kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
+ }
+ else
+ {
+ nrdfUncoupled += groupOptions.nrdf[g];
+ }
+ }
+
+ /* This conditional with > also catches nrdf=0 */
+ if (nrdfCoupled > nrdfUncoupled)
+ {
+ return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+/*! \brief This routine checks that the potential energy is finite.
*
+ * Always checks that the potential energy is finite. If step equals
+ * inputrec.init_step also checks that the magnitude of the potential energy
+ * is reasonable. Terminates with a fatal error when a check fails.
* Note that passing this check does not guarantee finite forces,
- * since those use slightly different arithmetics. But in most cases
+ * since those use slightly different arithmetic. But in most cases
* there is just a narrow coordinate range where forces are not finite
* and energies are finite.
*
- * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
+ * \param[in] step The step number, used for checking and printing
+ * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec The input record
+ */
+static void checkPotentialEnergyValidity(gmx_int64_t step,
+ const gmx_enerdata_t &enerd,
+ const t_inputrec &inputrec)
+{
+ /* Threshold valid for comparing absolute potential energy against
+ * the kinetic energy. Normally one should not consider absolute
+ * potential energy values, but with a factor of one million
+ * we should never get false positives.
+ */
+ constexpr real c_thresholdFactor = 1e6;
+
+ bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
+ real averageKineticEnergy = 0;
+ /* We only check for large potential energy at the initial step,
+     * because that is by far the most likely step for this to occur
+     * and because computing the average kinetic energy is not free.
+     * Note: nstcalcenergy >> 1 often does not allow us to catch large energies
+ * before they become NaN.
+ */
+ if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
+ {
+ averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
+ }
+
+ if (energyIsNotFinite || (averageKineticEnergy > 0 &&
+ enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
+ {
+        gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration, or by incorrect interactions or parameters in the topology.",
+ step,
+ enerd.term[F_EPOT],
+ energyIsNotFinite ? "not finite" : "extremely high",
+ enerd.term[F_LJ],
+ enerd.term[F_COUL_SR],
+ energyIsNotFinite ? "non-finite" : "very high",
+                  energyIsNotFinite ? " or NaN" : "");
+ }
+}
+
+/*! \brief Compute forces and/or energies for special algorithms
+ *
+ * The intention is to collect all calls to algorithms that compute
+ * forces on local atoms only and that do not contribute to the local
+ * virial sum (but add their virial contribution separately).
+ * Eventually these should likely all become ForceProviders.
+ * Within this function the intention is to have algorithms that do
+ * global communication at the end, so global barriers within the MD loop
+ * are as close together as possible.
+ *
+ * \param[in] fplog The log file
+ * \param[in] cr The communication record
+ * \param[in] inputrec The input record
+ * \param[in] step The current MD step
+ * \param[in] t The current time
+ * \param[in,out] wcycle Wallcycle accounting struct
+ * \param[in,out] forceProviders Pointer to a list of force providers
+ * \param[in] box The unit cell
+ * \param[in] x The coordinates
+ * \param[in] mdatoms Per atom properties
+ * \param[in] lambda Array of free-energy lambda values
+ * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
+ * \param[in,out] forceWithVirial Force and virial buffers
+ * \param[in,out] enerd Energy buffer
+ * \param[in,out] ed Essential dynamics pointer
+ * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
+ *
+ * \todo Remove bNS, which is used incorrectly.
+ * \todo Convert all other algorithms called here to ForceProviders.
+ */
+static void
+computeSpecialForces(FILE *fplog,
+ t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_int64_t step,
+ double t,
+ gmx_wallcycle_t wcycle,
+ ForceProviders *forceProviders,
+ matrix box,
+ rvec *x,
+ t_mdatoms *mdatoms,
+ real *lambda,
+ int forceFlags,
+ ForceWithVirial *forceWithVirial,
+ gmx_enerdata_t *enerd,
+ gmx_edsam_t ed,
+ gmx_bool bNS)
+{
+ const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
+
+ /* NOTE: Currently all ForceProviders only provide forces.
+ * When they also provide energies, remove this conditional.
+ */
+ if (computeForces)
+ {
+ /* Collect forces from modules */
+ forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
+ }
+
+ if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
+ {
+ pull_potential_wrapper(cr, inputrec, box, x,
+ forceWithVirial,
+ mdatoms, enerd, lambda, t,
+ wcycle);
+
+ if (inputrec->bDoAwh)
+ {
+ Awh &awh = *inputrec->awh;
+ enerd->term[F_COM_PULL] +=
+ awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
+ forceWithVirial,
+ t, step, wcycle, fplog);
+ }
+ }
+
+ rvec *f = as_rvec_array(forceWithVirial->force_.data());
+
+ /* Add the forces from enforced rotation potentials (if any) */
+ if (inputrec->bRot)
+ {
+ wallcycle_start(wcycle, ewcROTadd);
+ enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ wallcycle_stop(wcycle, ewcROTadd);
+ }
+
+ if (ed)
+ {
+ /* Note that since init_edsam() is called after the initialization
+ * of forcerec, edsam doesn't request the noVirSum force buffer.
+ * Thus if no other algorithm (e.g. PME) requires it, the forces
+ * here will contribute to the virial.
+ */
+ do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ }
+
+ /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
+ if (inputrec->bIMD && computeForces)
+ {
+ IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
+ }
+}
+
+/*! \brief Launch the prepare_step and spread stages of PME GPU.
+ *
+ * \param[in] pmedata The PME structure
+ * \param[in] box The box matrix
+ * \param[in] x Coordinate array
+ * \param[in] flags Force flags
+ * \param[in] wcycle The wallcycle structure
+ */
+static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
+ matrix box,
+ rvec x[],
+ int flags,
+ gmx_wallcycle_t wcycle)
+{
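+    // Spread and solve are always required; force calculation and energy/virial
+    // reduction are added only when this step requests them.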
+ int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+ pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
+ pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
+
+ pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
+ pme_gpu_launch_spread(pmedata, x, wcycle);
+}
+
+/*! \brief Launch the FFT and gather stages of PME GPU
+ *
+ * This function only implements setting the output forces (no accumulation).
+ *
+ * \param[in] pmedata The PME structure
+ * \param[in] wcycle The wallcycle structure
*/
-static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
+static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
+ gmx_wallcycle_t wcycle)
{
- if (!std::isfinite(enerd->term[F_EPOT]))
+ pme_gpu_launch_complex_transforms(pmedata, wcycle);
+ pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
+}
+
+/*! \brief
+ * Polling wait for either of the PME or nonbonded GPU tasks.
+ *
+ * Instead of a static order in waiting for GPU tasks, this function
+ * polls to check which of the two tasks completes first, and does the
+ * associated force buffer reduction overlapped with the other task.
+ * By doing that, unlike static scheduling order, it can always overlap
+ * one of the reductions, regardless of the GPU task completion order.
+ *
+ * \param[in] nbv Nonbonded verlet structure
+ * \param[in] pmedata PME module data
+ * \param[in,out] force Force array to reduce task outputs into.
+ * \param[in,out] forceWithVirial Force and virial buffers
+ * \param[in,out] fshift Shift force output vector that results are reduced into
+ * \param[in,out] enerd Energy data structure that results are reduced into
+ * \param[in] flags Force flags
+ * \param[in] wcycle The wallcycle structure
+ */
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
+ const gmx_pme_t *pmedata,
+ gmx::PaddedArrayRef<gmx::RVec> *force,
+ ForceWithVirial *forceWithVirial,
+ rvec fshift[],
+ gmx_enerdata_t *enerd,
+ int flags,
+ gmx_wallcycle_t wcycle)
+{
+ bool isPmeGpuDone = false;
+ bool isNbGpuDone = false;
+
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+
+ while (!isPmeGpuDone || !isNbGpuDone)
{
- gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
- enerd->term[F_EPOT],
- enerd->term[F_LJ],
- enerd->term[F_COUL_SR]);
+ if (!isPmeGpuDone)
+ {
+ matrix vir_Q;
+ real Vlr_q;
+
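+            // Block on the PME task only once the NB task has already completed; until then, just poll.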
+ GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
+ vir_Q, &Vlr_q, completionType);
+
+ if (isPmeGpuDone)
+ {
+ pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
+ enerd, vir_Q, Vlr_q);
+ }
+ }
+
+ if (!isNbGpuDone)
+ {
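+            // Likewise, block on the NB task only once the PME task has completed.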
+ GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+ isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            // To get the call count right, we issue a start/stop pair
+            // when the task finishes.
+ // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
+ // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+ if (isNbGpuDone)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->nbat, as_rvec_array(force->data()));
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+ }
+}
+
+/*! \brief
+ * Launch the dynamic rolling pruning GPU task.
+ *
+ * We currently alternate local/non-local list pruning in odd-even steps
+ * (only pruning every second step without DD).
+ *
+ * \param[in] cr The communication record
+ * \param[in] nbv Nonbonded verlet structure
+ * \param[in] inputrec The input record
+ * \param[in] step The current MD step
+ */
+static inline void launchGpuRollingPruning(const t_commrec *cr,
+ const nonbonded_verlet_t *nbv,
+ const t_inputrec *inputrec,
+ const gmx_int64_t step)
+{
+ /* We should not launch the rolling pruning kernel at a search
+ * step or just before search steps, since that's useless.
+ * Without domain decomposition we prune at even steps.
+ * With domain decomposition we alternate local and non-local
+ * pruning at even and odd steps.
+ */
+ int numRollingParts = nbv->listParams->numRollingParts;
+ GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
+ int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+ bool stepIsEven = ((stepWithCurrentList & 1) == 0);
+ if (stepWithCurrentList > 0 &&
+ stepWithCurrentList < inputrec->nstlist - 1 &&
+ (stepIsEven || DOMAINDECOMP(cr)))
+ {
+ nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+ stepIsEven ? eintLocal : eintNonlocal,
+ numRollingParts);
}
}
-void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
- t_inputrec *inputrec,
- gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
- gmx_localtop_t *top,
- gmx_groups_t gmx_unused *groups,
- matrix box, rvec x[], history_t *hist,
- rvec f[],
- tensor vir_force,
- t_mdatoms *mdatoms,
- gmx_enerdata_t *enerd, t_fcdata *fcd,
- real *lambda, t_graph *graph,
- t_forcerec *fr, interaction_const_t *ic,
- gmx_vsite_t *vsite, rvec mu_tot,
- double t, FILE *field, gmx_edsam_t ed,
- gmx_bool bBornRadii,
- int flags)
+static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_groups_t gmx_unused *groups,
+ matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
+ gmx::PaddedArrayRef<gmx::RVec> force,
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd, t_fcdata *fcd,
+ real *lambda, t_graph *graph,
+ t_forcerec *fr, interaction_const_t *ic,
+ gmx_vsite_t *vsite, rvec mu_tot,
+ double t, gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags,
+ DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
+ DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
int cg1, i, j;
- int start, homenr;
double mu[2*DIM];
gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
- gmx_bool bDiffKernels = FALSE;
rvec vzero, box_diag;
- float cycles_pme, cycles_force, cycles_wait_gpu;
- /* TODO To avoid loss of precision, float can't be used for a
- * cycle count. Build an object that can do this right and perhaps
- * also be used by gmx_wallcycle_t */
- gmx_cycles_t cycleCountBeforeLocalWorkCompletes = 0;
- nonbonded_verlet_t *nbv;
-
- cycles_force = 0;
+ float cycles_pme, cycles_wait_gpu;
+ nonbonded_verlet_t *nbv = fr->nbv;
+
+ bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ bFillGrid = (bNS && bStateChanged);
+ bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
+ bDoForces = (flags & GMX_FORCE_FORCES);
+ bUseGPU = fr->nbv->bUseGPU;
+ bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
+
+ const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
+ // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
+ const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
+ ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
+
+ /* At a search step we need to start the first balancing region
+ * somewhere early inside the step after communication during domain
+ * decomposition (and not during the previous step as usual).
+ */
+ if (bNS &&
+ ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
+ {
+ ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
+ }
+
cycles_wait_gpu = 0;
- nbv = fr->nbv;
- start = 0;
- homenr = mdatoms->homenr;
+ const int start = 0;
+ const int homenr = mdatoms->homenr;
clear_mat(vir_force);
cg1--;
}
- bStateChanged = (flags & GMX_FORCE_STATECHANGED);
- bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
- bFillGrid = (bNS && bStateChanged);
- bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
- bDoForces = (flags & GMX_FORCE_FORCES);
- bUseGPU = fr->nbv->bUseGPU;
- bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
-
if (bStateChanged)
{
update_forcerec(fr, box);
if (bCalcCGCM)
{
- put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
+ put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
inc_nrnb(nrnb, eNR_SHIFTX, homenr);
}
else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
{
- unshift_self(graph, box, x);
+ unshift_self(graph, box, as_rvec_array(x.data()));
}
}
nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
- fr->shift_vec, nbv->grp[0].nbat);
+ fr->shift_vec, nbv->nbat);
#if GMX_MPI
- if (!(cr->duty & DUTY_PME))
+ if (!thisRankHasDuty(cr, DUTY_PME))
{
- gmx_bool bBS;
- matrix boxs;
-
/* Send particle coordinates to the pme nodes.
* Since this is only implemented for domain decomposition
* and domain decomposition does not use the graph,
* we do not need to worry about shifting.
*/
-
- int pme_flags = 0;
-
wallcycle_start(wcycle, ewcPP_PMESENDX);
-
- bBS = (inputrec->nwall == 2);
- if (bBS)
- {
- copy_mat(box, boxs);
- svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
- }
-
- if (EEL_PME(fr->eeltype))
- {
- pme_flags |= GMX_PME_DO_COULOMB;
- }
-
- if (EVDW_PME(fr->vdwtype))
- {
- pme_flags |= GMX_PME_DO_LJ;
- }
-
- gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
- mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+ gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
+ lambda[efptCOUL], lambda[efptVDW],
(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
- pme_flags, step);
-
+ step);
wallcycle_stop(wcycle, ewcPP_PMESENDX);
}
#endif /* GMX_MPI */
+ if (useGpuPme)
+ {
+ launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
+ }
+
/* do gridding for pair search */
if (bNS)
{
if (graph && bStateChanged)
{
/* Calculate intramolecular shift vectors to make molecules whole */
- mk_mshift(fplog, graph, fr->ePBC, box, x);
+ mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
}
clear_rvec(vzero);
wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
0, vzero, box_diag,
- 0, mdatoms->homenr, -1, fr->cginfo, x,
- 0, NULL,
+ 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
+ 0, nullptr,
nbv->grp[eintLocal].kernel_type,
- nbv->grp[eintLocal].nbat);
+ nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
}
else
{
wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
- fr->cginfo, x,
+ fr->cginfo, as_rvec_array(x.data()),
nbv->grp[eintNonlocal].kernel_type,
- nbv->grp[eintNonlocal].nbat);
+ nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
}
- if (nbv->ngrp == 1 ||
- nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
- {
- nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
- nbv->nbs, mdatoms, fr->cginfo);
- }
- else
- {
- nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
- nbv->nbs, mdatoms, fr->cginfo);
- nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
- nbv->nbs, mdatoms, fr->cginfo);
- }
+ nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
wallcycle_stop(wcycle, ewcNS);
}
/* initialize the GPU atom data and copy shift vector */
if (bUseGPU)
{
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
if (bNS)
{
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
}
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
/* do local pair search */
{
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
- nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+ nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
&top->excls,
- ic->rlist,
+ nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
&nbv->grp[eintLocal].nbl_lists,
eintLocal,
nbv->grp[eintLocal].kernel_type,
nrnb);
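+        /* Record the search step for this pair list; dynamic pruning uses it to schedule pruning steps */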
+ nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+ if (nbv->listParams->useDynamicPruning && !bUseGPU)
+ {
+ nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+ }
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
if (bUseGPU)
{
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
- nbv->grp[eintLocal].nbat);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
+ nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
if (bUseGPU)
{
- wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
- /* launch local nonbonded F on GPU */
+ if (DOMAINDECOMP(cr))
+ {
+ ddOpenBalanceRegionGpu(cr->dd);
+ }
+
+ wallcycle_start(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ /* launch local nonbonded work on GPU */
do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
- nrnb, wcycle);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ step, nrnb, wcycle);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ }
+
+ if (useGpuPme)
+ {
+ // In PME GPU and mixed mode we launch FFT / gather after the
+ // X copy/transform to allow overlap as well as after the GPU NB
+ // launch to avoid FFT launch overhead hijacking the CPU and delaying
+ // the nonbonded kernel.
+ launchPmeGpuFftAndGather(fr->pmedata, wcycle);
}
/* Communicate coordinates and sum dipole if necessary +
do non-local pair search */
if (DOMAINDECOMP(cr))
{
- bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
- nbv->grp[eintLocal].kernel_type);
-
- if (bDiffKernels)
- {
- /* With GPU+CPU non-bonded calculations we need to copy
- * the local coordinates to the non-local nbat struct
- * (in CPU format) as the non-local kernel call also
- * calculates the local - non-local interactions.
- */
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
- nbv->grp[eintNonlocal].nbat);
- wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
- }
-
if (bNS)
{
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
- if (bDiffKernels)
- {
- nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
- }
-
- nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+ nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
&top->excls,
- ic->rlist,
+ nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
&nbv->grp[eintNonlocal].nbl_lists,
eintNonlocal,
nbv->grp[eintNonlocal].kernel_type,
nrnb);
-
+ nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+ if (nbv->listParams->useDynamicPruning && !bUseGPU)
+ {
+ nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+ }
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
else
{
wallcycle_start(wcycle, ewcMOVEX);
- dd_move_x(cr->dd, box, x);
+ dd_move_x(cr->dd, box, as_rvec_array(x.data()));
wallcycle_stop(wcycle, ewcMOVEX);
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
- nbv->grp[eintNonlocal].nbat);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
+ nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
- cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
- if (bUseGPU && !bDiffKernels)
+ if (bUseGPU)
{
- wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
- /* launch non-local nonbonded F on GPU */
+ wallcycle_start(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ /* launch non-local nonbonded tasks on GPU */
do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
- nrnb, wcycle);
- cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ step, nrnb, wcycle);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
}
if (bUseGPU)
{
/* launch D2H copy-back F */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- if (DOMAINDECOMP(cr) && !bDiffKernels)
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ if (DOMAINDECOMP(cr))
{
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
+ nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
flags, eatNonlocal);
}
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
+ nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
flags, eatLocal);
- cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
if (bStateChanged && inputrecNeedMutot(inputrec))
if (PAR(cr))
{
gmx_sumd(2*DIM, mu, cr);
+ ddReopenBalanceRegionCpu(cr->dd);
}
for (i = 0; i < 2; i++)
reset_enerdata(enerd);
clear_rvecs(SHIFTS, fr->fshift);
- if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
+ if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
{
wallcycle_start(wcycle, ewcPPDURINGPME);
dd_force_flop_start(cr->dd, nrnb);
if (inputrec->bRot)
{
- /* Enforced rotation has its own cycle counter that starts after the collective
- * coordinates have been communicated. It is added to ddCyclF to allow
- * for proper load-balancing */
wallcycle_start(wcycle, ewcROT);
- do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
wallcycle_stop(wcycle, ewcROT);
}
+ /* Temporary solution until all routines take PaddedRVecVector */
+ rvec *f = as_rvec_array(force.data());
+
/* Start the force cycle counter.
- * This counter is stopped after do_force_lowlevel.
- * No parallel communication should occur while this counter is running,
- * since that will interfere with the dynamic load balancing.
+ * Note that a different counter is used for dynamic load balancing.
*/
wallcycle_start(wcycle, ewcFORCE);
+
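+    // Forces are accumulated in the main force buffer unless direct virial
+    // contributions require the separate buffer selected below.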
+ gmx::ArrayRef<gmx::RVec> forceRef = force;
if (bDoForces)
{
- /* Reset forces for which the virial is calculated separately:
- * PME/Ewald forces if necessary */
- if (fr->bF_NoVirSum)
+ /* If we need to compute the virial, we might need a separate
+ * force buffer for algorithms for which the virial is calculated
+ * directly, such as PME.
+ */
+ if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
{
- if (flags & GMX_FORCE_VIRIAL)
- {
- fr->f_novirsum = fr->f_novirsum_alloc;
- }
- else
- {
- /* We are not calculating the pressure so we do not need
- * a separate array for forces that do not contribute
- * to the pressure.
- */
- fr->f_novirsum = f;
- }
- }
+ forceRef = *fr->forceBufferForDirectVirialContributions;
- if (fr->bF_NoVirSum)
- {
- if (flags & GMX_FORCE_VIRIAL)
- {
- if (fr->bDomDec)
- {
- clear_rvecs_omp(fr->f_novirsum_n, fr->f_novirsum);
- }
- else
- {
- clear_rvecs_omp(homenr, fr->f_novirsum+start);
- }
- }
+ /* We only compute forces on local atoms. Note that vsites can
+ * spread to non-local atoms, but that part of the buffer is
+ * cleared separately in the vsite spreading code.
+ */
+ clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
}
-        /* Clear the short- and long-range forces */
+        /* Clear the forces */
clear_rvecs_omp(fr->natoms_force_constr, f);
-
- clear_rvec(fr->vir_diag_posres);
}
+ /* forceWithVirial uses the local atom range only */
+ ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
+
if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
{
clear_pull_forces(inputrec->pull_work);
if (!bUseOrEmulGPU)
{
- /* Maybe we should move this into do_force_lowlevel */
do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
- nrnb, wcycle);
+ step, nrnb, wcycle);
}
if (fr->efep != efepNO)
if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
{
do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
- fr, x, f, mdatoms,
+ fr, as_rvec_array(x.data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
}
fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
{
do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
- fr, x, f, mdatoms,
+ fr, as_rvec_array(x.data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
}
}
- if (!bUseOrEmulGPU || bDiffKernels)
+ if (!bUseOrEmulGPU)
{
int aloc;
if (DOMAINDECOMP(cr))
{
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
- bDiffKernels ? enbvClearFYes : enbvClearFNo,
- nrnb, wcycle);
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ step, nrnb, wcycle);
}
if (!bUseOrEmulGPU)
* This can be split into a local and a non-local part when overlapping
* communication with calculation with domain decomposition.
*/
- cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, ewcFORCE);
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
- cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_start_nocount(wcycle, ewcFORCE);
/* if there are multiple fshift output buffers reduce them */
{
/* This is not in a subcounter because it takes a
negligible and constant-sized amount of time */
- nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
fr->fshift);
}
}
/* update QMMMrec, if necessary */
if (fr->bQMMM)
{
- update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
}
/* Compute the bonded and non-bonded energies and optionally forces */
do_force_lowlevel(fr, inputrec, &(top->idef),
cr, nrnb, wcycle, mdatoms,
- x, hist, f, enerd, fcd, top, fr->born,
+ as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
bBornRadii, box,
inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
flags, &cycles_pme);
- cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, ewcFORCE);
- if (ed)
- {
- do_flood(cr, inputrec, x, f, ed, box, step, bNS);
- }
+ computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
+ fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
+ flags, &forceWithVirial, enerd,
+ ed, bNS);
- if (bUseOrEmulGPU && !bDiffKernels)
+ if (bUseOrEmulGPU)
{
/* wait for non-local forces (or calculate in emulation mode) */
if (DOMAINDECOMP(cr))
{
if (bUseGPU)
{
- float cycles_tmp;
-
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
- flags, eatNonlocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
- cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
- cycles_wait_gpu += cycles_tmp;
- cycles_force += cycles_tmp;
+ nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, eatNonlocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
}
else
{
wallcycle_start_nocount(wcycle, ewcFORCE);
do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
- nrnb, wcycle);
- cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ step, nrnb, wcycle);
+ wallcycle_stop(wcycle, ewcFORCE);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
{
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
- nbv->grp[eintNonlocal].nbat, f);
+ nbv->nbat, f);
}
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
- cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
}
- if (bDoForces && DOMAINDECOMP(cr))
+ if (DOMAINDECOMP(cr))
{
- if (bUseGPU)
+ /* We are done with the CPU compute.
+ * We will now communicate the non-local forces.
+ * If we use a GPU this will overlap with GPU work, so in that case
+ * we do not close the DD force balancing region here.
+ */
+ if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
{
- /* We are done with the CPU compute, but the GPU local non-bonded
- * kernel can still be running while we communicate the forces.
- * We start a counter here, so we can, hopefully, time the rest
- * of the GPU kernel execution and data transfer.
- */
- cycleCountBeforeLocalWorkCompletes = gmx_cycles_read();
+ ddCloseBalanceRegionCpu(cr->dd);
}
+ if (bDoForces)
+ {
+ wallcycle_start(wcycle, ewcMOVEF);
+ dd_move_f(cr->dd, f, fr->fshift);
+ wallcycle_stop(wcycle, ewcMOVEF);
+ }
+ }
- /* Communicate the forces */
- wallcycle_start(wcycle, ewcMOVEF);
- dd_move_f(cr->dd, f, fr->fshift);
- wallcycle_stop(wcycle, ewcMOVEF);
+    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
+ // an alternating wait/reduction scheme.
+ bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+ if (alternateGpuWait)
+ {
+ alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
}
- if (bUseOrEmulGPU)
+ if (!alternateGpuWait && useGpuPme)
{
- /* wait for local forces (or calculate in emulation mode) */
- if (bUseGPU)
- {
- float cycles_tmp, cycles_wait_est;
- /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
- * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
- * but even with a step of 0.1 ms the difference is less than 1%
- * of the step time.
- */
- const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+ matrix vir_Q;
+ real Vlr_q;
+ pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+ pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
+ }
- wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ /* Wait for local GPU NB outputs on the non-alternating wait path */
+ if (!alternateGpuWait && bUseGPU)
+ {
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+ * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
+ * but even with a step of 0.1 ms the difference is less than 1%
+ * of the step time.
+ */
+ const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
+
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
flags, eatLocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-
- if (bDoForces && DOMAINDECOMP(cr))
- {
- cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
+ float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- if (cycles_tmp < gpuWaitApiOverheadMargin)
- {
- /* We measured few cycles, it could be that the kernel
- * and transfer finished earlier and there was no actual
- * wait time, only API call overhead.
- * Then the actual time could be anywhere between 0 and
- * cycles_wait_est. As a compromise, we use half the time.
- */
- cycles_wait_est *= 0.5f;
- }
- }
- else
+ if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
+ {
+ DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
+ if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
{
- /* No force communication so we actually timed the wait */
- cycles_wait_est = cycles_tmp;
+                /* We measured few cycles, so it could be that the kernel
+                 * and transfer finished earlier and there was no actual
+                 * wait time, only API call overhead.
+                 * Then the actual wait time could be anywhere between 0 and
+                 * the measured value, so we do not report this as a GPU wait.
+ */
+ waitedForGpu = DdBalanceRegionWaitedForGpu::no;
}
- /* Even though this is after dd_move_f, the actual task we are
- * waiting for runs asynchronously with dd_move_f and we usually
- * have nothing to balance it with, so we can and should add
- * the time to the force time for load balancing.
- */
- cycles_force += cycles_wait_est;
- cycles_wait_gpu += cycles_wait_est;
-
- /* now clear the GPU outputs while we finish the step on the CPU */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
}
- else
+ }
+
+ if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
+ {
+ // NOTE: emulation kernel is not included in the balancing region,
+ // but emulation mode does not target performance anyway
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ step, nrnb, wcycle);
+ wallcycle_stop(wcycle, ewcFORCE);
+ }
+
+ if (bUseGPU)
+ {
+ /* now clear the GPU outputs while we finish the step on the CPU */
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+
+ /* Is dynamic pair-list pruning activated? */
+ if (nbv->listParams->useDynamicPruning)
{
- wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal,
- DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
- nrnb, wcycle);
- wallcycle_stop(wcycle, ewcFORCE);
+ launchGpuRollingPruning(cr, nbv, inputrec, step);
}
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+
+ // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
+ }
+
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
+ * on the non-alternating path. */
+ if (bUseOrEmulGPU && !alternateGpuWait)
+ {
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
- nbv->grp[eintLocal].nbat, f);
+ nbv->nbat, f);
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
if (DOMAINDECOMP(cr))
{
dd_force_flop_stop(cr->dd, nrnb);
- if (wcycle)
- {
- dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
- if (bUseGPU)
- {
- dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
- }
- }
}
if (bDoForces)
{
- if (inputrecElecField(inputrec))
- {
- /* Compute forces due to electric field */
- calc_f_el(MASTER(cr) ? field : NULL,
- start, homenr, mdatoms->chargeA, fr->f_novirsum,
- inputrec->ex, inputrec->et, t);
- }
-
-        /* If we have NoVirSum forces, but we do not calculate the virial,
-         * we sum fr->f_novirsum=f later.
-         */
+        /* If we have direct virial contributions but do not calculate the
+         * virial, the direct-contribution buffer aliases f and the vsite
+         * spreading is done in post_process_forces() instead.
+         */
- if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
{
wallcycle_start(wcycle, ewcVSITESPREAD);
- spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
&top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
wallcycle_stop(wcycle, ewcVSITESPREAD);
}
if (flags & GMX_FORCE_VIRIAL)
{
/* Calculation of the virial must be done after vsites! */
- calc_virial(0, mdatoms->homenr, x, f,
+ calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
vir_force, graph, box, nrnb, fr, inputrec->ePBC);
}
}
- if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
- {
- /* Since the COM pulling is always done mass-weighted, no forces are
- * applied to vsites and this call can be done after vsite spreading.
- */
- pull_potential_wrapper(cr, inputrec, box, x,
- f, vir_force, mdatoms, enerd, lambda, t,
- wcycle);
- }
-
- /* Add the forces from enforced rotation potentials (if any) */
- if (inputrec->bRot)
- {
- wallcycle_start(wcycle, ewcROTadd);
- enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
- wallcycle_stop(wcycle, ewcROTadd);
- }
-
- /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
- IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
-
- if (PAR(cr) && !(cr->duty & DUTY_PME))
+ if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
{
/* In case of node-splitting, the PP nodes receive the long-range
* forces, virial and energy from the PME nodes here.
*/
- pme_receive_force_ener(cr, wcycle, enerd, fr);
+ pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
}
if (bDoForces)
{
post_process_forces(cr, step, nrnb, wcycle,
- top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ top, box, as_rvec_array(x.data()), f, &forceWithVirial,
+ vir_force, mdatoms, graph, fr, vsite,
flags);
}
if (!EI_TPI(inputrec->eI))
{
- checkPotentialEnergyValidity(enerd);
+ checkPotentialEnergyValidity(step, *enerd, *inputrec);
}
}
}
-void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
- t_inputrec *inputrec,
- gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
- gmx_localtop_t *top,
- gmx_groups_t *groups,
- matrix box, rvec x[], history_t *hist,
- rvec f[],
- tensor vir_force,
- t_mdatoms *mdatoms,
- gmx_enerdata_t *enerd, t_fcdata *fcd,
- real *lambda, t_graph *graph,
- t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
- double t, FILE *field, gmx_edsam_t ed,
- gmx_bool bBornRadii,
- int flags)
+static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_groups_t *groups,
+ matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
+ gmx::PaddedArrayRef<gmx::RVec> force,
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd, t_fcdata *fcd,
+ real *lambda, t_graph *graph,
+ t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
+ double t, gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags,
+ DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
+ DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
int cg0, cg1, i, j;
- int start, homenr;
double mu[2*DIM];
gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
gmx_bool bDoForces;
- float cycles_pme, cycles_force;
+ float cycles_pme;
- start = 0;
- homenr = mdatoms->homenr;
+ const int start = 0;
+ const int homenr = mdatoms->homenr;
clear_mat(vir_force);
if (bCalcCGCM)
{
put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
- &(top->cgs), x, fr->cg_cm);
+ &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
inc_nrnb(nrnb, eNR_CGCM, homenr);
inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
}
else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
{
- unshift_self(graph, box, x);
+ unshift_self(graph, box, as_rvec_array(x.data()));
}
}
else if (bCalcCGCM)
{
- calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
+ calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
inc_nrnb(nrnb, eNR_CGCM, homenr);
}
}
#if GMX_MPI
- if (!(cr->duty & DUTY_PME))
+ if (!thisRankHasDuty(cr, DUTY_PME))
{
- gmx_bool bBS;
- matrix boxs;
-
/* Send particle coordinates to the pme nodes.
* Since this is only implemented for domain decomposition
* and domain decomposition does not use the graph,
* we do not need to worry about shifting.
*/
-
- int pme_flags = 0;
-
wallcycle_start(wcycle, ewcPP_PMESENDX);
-
- bBS = (inputrec->nwall == 2);
- if (bBS)
- {
- copy_mat(box, boxs);
- svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
- }
-
- if (EEL_PME(fr->eeltype))
- {
- pme_flags |= GMX_PME_DO_COULOMB;
- }
-
- if (EVDW_PME(fr->vdwtype))
- {
- pme_flags |= GMX_PME_DO_LJ;
- }
-
- gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
- mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+ gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
+ lambda[efptCOUL], lambda[efptVDW],
(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
- pme_flags, step);
-
+ step);
wallcycle_stop(wcycle, ewcPP_PMESENDX);
}
#endif /* GMX_MPI */
if (DOMAINDECOMP(cr))
{
wallcycle_start(wcycle, ewcMOVEX);
- dd_move_x(cr->dd, box, x);
+ dd_move_x(cr->dd, box, as_rvec_array(x.data()));
wallcycle_stop(wcycle, ewcMOVEX);
+ /* No GPU support, no move_x overlap, so reopen the balance region here */
+ if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
+ {
+ ddReopenBalanceRegionCpu(cr->dd);
+ }
}
if (inputrecNeedMutot(inputrec))
if (PAR(cr))
{
gmx_sumd(2*DIM, mu, cr);
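+ /* Reopen the balance region after the global sum so that the time
+ * spent communicating is not counted as force-compute work. */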
+ ddReopenBalanceRegionCpu(cr->dd);
}
for (i = 0; i < 2; i++)
{
if (graph && bStateChanged)
{
/* Calculate intramolecular shift vectors to make molecules whole */
- mk_mshift(fplog, graph, fr->ePBC, box, x);
+ mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
}
/* Do the actual neighbour searching */
if (inputrec->implicit_solvent && bNS)
{
make_gb_nblist(cr, inputrec->gb_algorithm,
- x, box, fr, &top->idef, graph, fr->born);
+ as_rvec_array(x.data()), box, fr, &top->idef, graph, fr->born);
}
- if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
+ if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
{
wallcycle_start(wcycle, ewcPPDURINGPME);
dd_force_flop_start(cr->dd, nrnb);
if (inputrec->bRot)
{
- /* Enforced rotation has its own cycle counter that starts after the collective
- * coordinates have been communicated. It is added to ddCyclF to allow
- * for proper load-balancing */
wallcycle_start(wcycle, ewcROT);
- do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
wallcycle_stop(wcycle, ewcROT);
}
+ /* Temporary solution until all routines take PaddedRVecVector */
+ rvec *f = as_rvec_array(force.data());
+
/* Start the force cycle counter.
- * This counter is stopped after do_force_lowlevel.
- * No parallel communication should occur while this counter is running,
- * since that will interfere with the dynamic load balancing.
+ * Note that a different counter is used for dynamic load balancing.
*/
wallcycle_start(wcycle, ewcFORCE);
+ gmx::ArrayRef<gmx::RVec> forceRef = force;
if (bDoForces)
{
- /* Reset forces for which the virial is calculated separately:
- * PME/Ewald forces if necessary */
- if (fr->bF_NoVirSum)
+ /* If we need to compute the virial, we might need a separate
+ * force buffer for algorithms for which the virial is calculated
+ * separately, such as PME.
+ */
+ if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
{
- if (flags & GMX_FORCE_VIRIAL)
- {
- fr->f_novirsum = fr->f_novirsum_alloc;
- if (fr->bDomDec)
- {
- clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
- }
- else
- {
- clear_rvecs(homenr, fr->f_novirsum+start);
- }
- }
- else
- {
- /* We are not calculating the pressure so we do not need
- * a separate array for forces that do not contribute
- * to the pressure.
- */
- fr->f_novirsum = f;
- }
+ forceRef = *fr->forceBufferForDirectVirialContributions;
+
+ clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
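+ /* This buffer starts from zero every step; post_process_forces()
+ * later adds it back into the main force array together with
+ * its virial contribution. */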
}
/* Clear the short- and long-range forces */
clear_rvecs(fr->natoms_force_constr, f);
-
- clear_rvec(fr->vir_diag_posres);
}
+
+ /* forceWithVirial might need the full force atom range */
+ ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
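+ /* Consumers accumulate forces into forceWithVirial.force_; algorithms
+ * with a direct virial term (e.g. PME on separate ranks, see
+ * pme_receive_force_ener below) can also record it on the object,
+ * e.g. via ForceWithVirial::addVirialContribution() from forceoutput.h. */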
+
if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
{
clear_pull_forces(inputrec->pull_work);
/* update QMMMrec, if necessary */
if (fr->bQMMM)
{
- update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
}
/* Compute the bonded and non-bonded energies and optionally forces */
do_force_lowlevel(fr, inputrec, &(top->idef),
cr, nrnb, wcycle, mdatoms,
- x, hist, f, enerd, fcd, top, fr->born,
+ as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
bBornRadii, box,
inputrec->fepvals, lambda,
graph, &(top->excls), fr->mu_tot,
flags,
&cycles_pme);
- cycles_force = wallcycle_stop(wcycle, ewcFORCE);
-
- if (ed)
- {
- do_flood(cr, inputrec, x, f, ed, box, step, bNS);
- }
+ wallcycle_stop(wcycle, ewcFORCE);
if (DOMAINDECOMP(cr))
{
dd_force_flop_stop(cr->dd, nrnb);
- if (wcycle)
+
+ if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
{
- dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ ddCloseBalanceRegionCpu(cr->dd);
}
}
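+ /* "Special" forces: COM pulling, enforced rotation, IMD and any
+ * registered ForceProviders (e.g. electric fields) are all applied
+ * through this single call. */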
+ computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
+ fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
+ flags, &forceWithVirial, enerd,
+ ed, bNS);
+
if (bDoForces)
{
- if (inputrecElecField(inputrec))
- {
- /* Compute forces due to electric field */
- calc_f_el(MASTER(cr) ? field : NULL,
- start, homenr, mdatoms->chargeA, fr->f_novirsum,
- inputrec->ex, inputrec->et, t);
- }
-
/* Communicate the forces */
if (DOMAINDECOMP(cr))
{
* When we do not calculate the virial, fr->f_novirsum = f,
* so we have already communicated these forces.
*/
- if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
(flags & GMX_FORCE_VIRIAL))
{
- dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr);
}
wallcycle_stop(wcycle, ewcMOVEF);
}
/* If we have NoVirSum forces, but we do not calculate the virial,
* we sum fr->f_novirsum=f later.
*/
- if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
{
wallcycle_start(wcycle, ewcVSITESPREAD);
- spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
&top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
wallcycle_stop(wcycle, ewcVSITESPREAD);
}
if (flags & GMX_FORCE_VIRIAL)
{
/* Calculation of the virial must be done after vsites! */
- calc_virial(0, mdatoms->homenr, x, f,
+ calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
vir_force, graph, box, nrnb, fr, inputrec->ePBC);
}
}
- if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
- {
- pull_potential_wrapper(cr, inputrec, box, x,
- f, vir_force, mdatoms, enerd, lambda, t,
- wcycle);
- }
-
- /* Add the forces from enforced rotation potentials (if any) */
- if (inputrec->bRot)
- {
- wallcycle_start(wcycle, ewcROTadd);
- enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
- wallcycle_stop(wcycle, ewcROTadd);
- }
-
- /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
- IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
-
- if (PAR(cr) && !(cr->duty & DUTY_PME))
+ if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
{
/* In case of node-splitting, the PP nodes receive the long-range
* forces, virial and energy from the PME nodes here.
*/
- pme_receive_force_ener(cr, wcycle, enerd, fr);
+ pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
}
if (bDoForces)
{
post_process_forces(cr, step, nrnb, wcycle,
- top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ top, box, as_rvec_array(x.data()), f, &forceWithVirial,
+ vir_force, mdatoms, graph, fr, vsite,
flags);
}
if (!EI_TPI(inputrec->eI))
{
- checkPotentialEnergyValidity(enerd);
+ checkPotentialEnergyValidity(step, *enerd, *inputrec);
}
}
gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
gmx_localtop_t *top,
gmx_groups_t *groups,
- matrix box, rvec x[], history_t *hist,
- rvec f[],
+ matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
+ gmx::PaddedArrayRef<gmx::RVec> force,
tensor vir_force,
t_mdatoms *mdatoms,
gmx_enerdata_t *enerd, t_fcdata *fcd,
- real *lambda, t_graph *graph,
+ gmx::ArrayRef<real> lambda, t_graph *graph,
t_forcerec *fr,
gmx_vsite_t *vsite, rvec mu_tot,
- double t, FILE *field, gmx_edsam_t ed,
+ double t, gmx_edsam_t ed,
gmx_bool bBornRadii,
- int flags)
+ int flags,
+ DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
+ DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
/* modify force flag if not doing nonbonded */
if (!fr->bNonbonded)
{
flags &= ~GMX_FORCE_NONBONDED;
}
+ GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
+ GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
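+ /* The padding lets SIMD kernels load and store whole registers
+ * beyond the last real atom without touching unallocated memory. */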
+
switch (inputrec->cutoff_scheme)
{
case ecutsVERLET:
top,
groups,
box, x, hist,
- f, vir_force,
+ force, vir_force,
mdatoms,
enerd, fcd,
- lambda, graph,
+ lambda.data(), graph,
fr, fr->ic,
vsite, mu_tot,
- t, field, ed,
+ t, ed,
bBornRadii,
- flags);
+ flags,
+ ddOpenBalanceRegion,
+ ddCloseBalanceRegion);
break;
case ecutsGROUP:
do_force_cutsGROUP(fplog, cr, inputrec,
top,
groups,
box, x, hist,
- f, vir_force,
+ force, vir_force,
mdatoms,
enerd, fcd,
- lambda, graph,
+ lambda.data(), graph,
fr, vsite, mu_tot,
- t, field, ed,
+ t, ed,
bBornRadii,
- flags);
+ flags,
+ ddOpenBalanceRegion,
+ ddCloseBalanceRegion);
break;
default:
gmx_incons("Invalid cut-off scheme passed!");
}
+
+ /* In case we don't have constraints and are using GPUs, the next balancing
+ * region starts here.
+ * Some "special" work at the end of do_force_cuts?, such as vsite spread,
+ * virial calculation and COM pulling, is thus not included in
+ * the balance timing, which is OK as most of these tasks do communication.
+ */
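+ /* These region arguments should be ::yes only when domain decomposition
+ * is active, so that cr->dd is valid here. */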
+ if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
+ {
+ ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
+ }
}
dvdl_dum = 0;
/* constrain the current position */
- constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
ir, cr, step, 0, 1.0, md,
- state->x, state->x, NULL,
+ as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
fr->bMolPBC, state->box,
state->lambda[efptBONDED], &dvdl_dum,
- NULL, NULL, nrnb, econqCoord);
+ nullptr, nullptr, nrnb, econqCoord);
if (EI_VV(ir->eI))
{
/* constrain the initial velocity, and save it */
/* also may be useful if we need the ekin from the halfstep for velocity verlet */
- constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
ir, cr, step, 0, 1.0, md,
- state->x, state->v, state->v,
+ as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
fr->bMolPBC, state->box,
state->lambda[efptBONDED], &dvdl_dum,
- NULL, NULL, nrnb, econqVeloc);
+ nullptr, nullptr, nrnb, econqVeloc);
}
/* constrain the initial velocities at t-dt/2 */
if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
gmx_step_str(step, buf));
}
dvdl_dum = 0;
- constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
ir, cr, step, -1, 1.0, md,
- state->x, savex, NULL,
+ as_rvec_array(state->x.data()), savex, nullptr,
fr->bMolPBC, state->box,
state->lambda[efptBONDED], &dvdl_dum,
- state->v, NULL, nrnb, econqCoord);
+ as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
for (i = start; i < end; i++)
{
fr->virdiffsix = 0;
fr->virdifftwelve = 0;
+ const interaction_const_t *ic = fr->ic;
+
if (eDispCorr != edispcNO)
{
for (i = 0; i < 2; i++)
{
eners[i] = 0;
virs[i] = 0;
}
- if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
- (fr->vdw_modifier == eintmodPOTSWITCH) ||
- (fr->vdw_modifier == eintmodFORCESWITCH) ||
- (fr->vdwtype == evdwSHIFT) ||
- (fr->vdwtype == evdwSWITCH))
+ if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
+ (ic->vdw_modifier == eintmodPOTSWITCH) ||
+ (ic->vdw_modifier == eintmodFORCESWITCH) ||
+ (ic->vdwtype == evdwSHIFT) ||
+ (ic->vdwtype == evdwSWITCH))
{
- if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
- (fr->vdw_modifier == eintmodFORCESWITCH) ||
- (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
+ if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
+ (ic->vdw_modifier == eintmodFORCESWITCH) ||
+ (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
{
gmx_fatal(FARGS,
"With dispersion correction rvdw-switch can not be zero "
- "for vdw-type = %s", evdw_names[fr->vdwtype]);
+ "for vdw-type = %s", evdw_names[ic->vdwtype]);
}
/* TODO This code depends on the logic in tables.c that
vdwtab = fr->dispersionCorrectionTable->data;
/* Round the cut-offs to exact table values for precision */
- ri0 = static_cast<int>(floor(fr->rvdw_switch*scale));
- ri1 = static_cast<int>(ceil(fr->rvdw*scale));
+ ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
+ ri1 = static_cast<int>(ceil(ic->rvdw*scale));
/* The code below has some support for handling force-switching, i.e.
* when the force (instead of potential) is switched over a limited
* we need to calculate the constant shift up to the point where we
* start modifying the potential.
*/
- ri0 = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
+ ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
r0 = ri0/scale;
rc3 = r0*r0*r0;
rc9 = rc3*rc3*rc3;
- if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
- (fr->vdwtype == evdwSHIFT))
+ if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
+ (ic->vdwtype == evdwSHIFT))
{
/* Determine the constant energy shift below rvdw_switch.
* Table has a scale factor since we have scaled it down to compensate
fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
}
- else if (fr->vdw_modifier == eintmodPOTSHIFT)
+ else if (ic->vdw_modifier == eintmodPOTSHIFT)
{
fr->enershiftsix = (real)(-1.0/(rc3*rc3));
fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
virs[0] += 8.0*M_PI/rc3;
virs[1] += -16.0*M_PI/(3.0*rc9);
}
- else if (fr->vdwtype == evdwCUT ||
- EVDW_PME(fr->vdwtype) ||
- fr->vdwtype == evdwUSER)
+ else if (ic->vdwtype == evdwCUT ||
+ EVDW_PME(ic->vdwtype) ||
+ ic->vdwtype == evdwUSER)
{
- if (fr->vdwtype == evdwUSER && fplog)
+ if (ic->vdwtype == evdwUSER && fplog)
{
fprintf(fplog,
"WARNING: using dispersion correction with user tables\n");
* can be used here.
*/
- rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
+ rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
rc9 = rc3*rc3*rc3;
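/* Analytic tail integrals per unit C6/C12 for g(r) = 1 beyond rc:
*   int_rc^inf 4 pi r^2 r^-6  dr = 4 pi/(3 rc^3)
*   int_rc^inf 4 pi r^2 r^-12 dr = 4 pi/(9 rc^9)
* with signs following the -C6/r^6 + C12/r^12 convention below.
*/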
/* Contribution beyond the cut-off */
eners[0] += -4.0*M_PI/(3.0*rc3);
eners[1] += 4.0*M_PI/(9.0*rc9);
- if (fr->vdw_modifier == eintmodPOTSHIFT)
+ if (ic->vdw_modifier == eintmodPOTSHIFT)
{
/* Contribution within the cut-off */
eners[0] += -4.0*M_PI/(3.0*rc3);
{
gmx_fatal(FARGS,
"Dispersion correction is not implemented for vdw-type = %s",
- evdw_names[fr->vdwtype]);
+ evdw_names[ic->vdwtype]);
}
/* When we deprecate the group kernels the code below can go too */
- if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
+ if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
{
/* Calculate self-interaction coefficient (assuming that
* the reciprocal-space contribution is constant in the
* region that contributes to the self-interaction).
*/
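/* The LJ-PME real/reciprocal split satisfies
*   (1 - exp(-x^2)*(1 + x^2 + x^4/2))/r^6 -> beta^6/6  as x = beta*r -> 0,
* which motivates the coefficient used here.
*/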
- fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
+ fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
- eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
- virs[0] += gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
+ eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
+ virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
}
fr->enerdiffsix = eners[0];
}
}
-void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
- t_graph *graph, rvec x[])
-{
- if (fplog)
- {
- fprintf(fplog, "Removing pbc first time\n");
- }
- calc_shifts(box, fr->shift_vec);
- if (graph)
- {
- mk_mshift(fplog, graph, fr->ePBC, box, x);
- if (gmx_debug_at)
- {
- p_graph(debug, "do_pbc_first 1", graph);
- }
- shift_self(graph, box, x);
- /* By doing an extra mk_mshift the molecules that are broken
- * because they were e.g. imported from another software
- * will be made whole again. Such are the healing powers
- * of GROMACS.
- */
- mk_mshift(fplog, graph, fr->ePBC, box, x);
- if (gmx_debug_at)
- {
- p_graph(debug, "do_pbc_first 2", graph);
- }
- }
- if (fplog)
- {
- fprintf(fplog, "Done rmpbc\n");
- }
-}
-
-static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
const gmx_mtop_t *mtop, rvec x[],
gmx_bool bFirst)
{
else
{
/* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
- mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
+ mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
0, molb->natoms_mol, FALSE, FALSE, graph);
for (mol = 0; mol < molb->nmol; mol++)
sfree(graph);
}
-void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
+void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
const gmx_mtop_t *mtop, rvec x[])
{
low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
}
-void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
- gmx_mtop_t *mtop, rvec x[])
+void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
+ const gmx_mtop_t *mtop, rvec x[])
{
low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
}
-void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
+void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
{
int t, nth;
nth = gmx_omp_nthreads_get(emntDefault);
{
try
{
- int offset, len;
-
- offset = (natoms*t )/nth;
- len = (natoms*(t + 1))/nth - offset;
- put_atoms_in_box(ePBC, box, len, x + offset);
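+ /* Split the atom range evenly over the threads; the integer
+ * arithmetic guarantees the chunks tile [0, natoms) exactly. */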
+ size_t natoms = x.size();
+ size_t offset = (natoms*t )/nth;
+ size_t len = (natoms*(t + 1))/nth - offset;
+ put_atoms_in_box(ePBC, box, x.subArray(offset, len));
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
}
// TODO This can be cleaned up a lot, and move back to runner.cpp
-void finish_run(FILE *fplog, t_commrec *cr,
- t_inputrec *inputrec,
+void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
+ const t_inputrec *inputrec,
t_nrnb nrnb[], gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t *nbv,
+ gmx_pme_t *pme,
gmx_bool bWriteStat)
{
- t_nrnb *nrnb_tot = NULL;
+ t_nrnb *nrnb_tot = nullptr;
double delta_t = 0;
double nbfs = 0, mflop = 0;
double elapsed_time,
if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
{
- md_print_warn(cr, fplog,
- "Simulation ended prematurely, no performance report will be written.");
+ GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
printReport = false;
}
sfree(nrnb_tot);
}
- if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+ if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
{
print_dd_statistics(cr, inputrec, fplog);
}
* to task parallelism. */
int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
int nthreads_pme = gmx_omp_nthreads_get(emntPME);
- wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
+ wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
auto cycle_sum(wallcycle_sum(cr, wcycle));
if (printReport)
{
- struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
-
- wallcycle_print(fplog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
+ auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+ gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
+ if (pme_gpu_task_enabled(pme))
+ {
+ pme_gpu_get_timings(pme, &pme_gpu_timings);
+ }
+ wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
elapsed_time_over_all_ranks,
- wcycle, cycle_sum, gputimes);
+ wcycle, cycle_sum,
+ nbnxn_gpu_timings,
+ &pme_gpu_timings);
if (EI_DYNAMICS(inputrec->eI))
{
}
}
-extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
{
/* this function works, but could probably use a logic rewrite to keep all the different
types of efep straight. */
- int i;
+ if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ {
+ return;
+ }
+
t_lambda *fep = ir->fepvals;
+ *fep_state = fep->init_fep_state; /* this might overwrite the value read
+ from a checkpoint, if one is set;
+ for now a kludge is in place to prevent that. */
- if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ for (int i = 0; i < efptNR; i++)
{
- for (i = 0; i < efptNR; i++)
+ /* overwrite lambda state with init_lambda for now for backwards compatibility */
+ if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
{
- lambda[i] = 0.0;
+ lambda[i] = fep->init_lambda;
if (lam0)
{
- lam0[i] = 0.0;
+ lam0[i] = lambda[i];
}
}
- return;
- }
- else
- {
- *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
- if checkpoint is set -- a kludge is in for now
- to prevent this.*/
- for (i = 0; i < efptNR; i++)
+ else
{
- /* overwrite lambda state with init_lambda for now for backwards compatibility */
- if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
+ lambda[i] = fep->all_lambda[i][*fep_state];
+ if (lam0)
{
- lambda[i] = fep->init_lambda;
- if (lam0)
- {
- lam0[i] = lambda[i];
- }
- }
- else
- {
- lambda[i] = fep->all_lambda[i][*fep_state];
- if (lam0)
- {
- lam0[i] = lambda[i];
- }
+ lam0[i] = lambda[i];
}
}
- if (ir->bSimTemp)
+ }
+ if (ir->bSimTemp)
+ {
+ /* need to rescale control temperatures to match current state */
+ for (int i = 0; i < ir->opts.ngtc; i++)
{
- /* need to rescale control temperatures to match current state */
- for (i = 0; i < ir->opts.ngtc; i++)
+ if (ir->opts.ref_t[i] > 0)
{
- if (ir->opts.ref_t[i] > 0)
- {
- ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
- }
+ ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
}
}
}
/* Report the initial lambda components to the log */
- if (fplog != NULL)
+ if (fplog != nullptr)
{
fprintf(fplog, "Initial vector of lambda components:[ ");
- for (i = 0; i < efptNR; i++)
+ for (const auto &l : lambda)
{
- fprintf(fplog, "%10.4f ", lambda[i]);
+ fprintf(fplog, "%10.4f ", l);
}
fprintf(fplog, "]\n");
}
void init_md(FILE *fplog,
- t_commrec *cr, t_inputrec *ir, const gmx_output_env_t *oenv,
+ t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
+ t_inputrec *ir, const gmx_output_env_t *oenv,
+ const MdrunOptions &mdrunOptions,
double *t, double *t0,
- real *lambda, int *fep_state, double *lam0,
+ t_state *globalState, double *lam0,
t_nrnb *nrnb, gmx_mtop_t *mtop,
gmx_update_t **upd,
int nfile, const t_filenm fnm[],
gmx_mdoutf_t *outf, t_mdebin **mdebin,
tensor force_vir, tensor shake_vir, rvec mu_tot,
- gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
+ gmx_bool *bSimAnn, t_vcm **vcm,
gmx_wallcycle_t wcycle)
{
int i;
}
/* Initialize lambda variables */
- initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
+ /* TODO: Clean up initialization of fep_state and lambda in t_state.
+ * We currently need to call initialize_lambdas on non-master ranks
+ * to initialize lam0.
+ */
+ if (MASTER(cr))
+ {
+ initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
+ }
+ else
+ {
+ int tmpFepState;
+ std::array<real, efptNR> tmpLambda;
+ initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
+ }
// TODO upd is never NULL in practice, but the analysers don't know that
if (upd)
}
if (*bSimAnn)
{
- update_annealing_target_temp(ir, ir->init_t, upd ? *upd : NULL);
+ update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
}
- if (vcm != NULL)
+ if (vcm != nullptr)
{
*vcm = init_vcm(fplog, &mtop->groups, ir);
}
- if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
+ if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
{
if (ir->etc == etcBERENDSEN)
{
please_cite(fplog, "Goga2012");
}
}
- if ((ir->et[XX].n > 0) || (ir->et[YY].n > 0) || (ir->et[ZZ].n > 0))
- {
- please_cite(fplog, "Caleman2008a");
- }
init_nrnb(nrnb);
if (nfile != -1)
{
- *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv, wcycle);
+ *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
- *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : mdoutf_get_fp_ene(*outf),
+ *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
mtop, ir, mdoutf_get_fp_dhdl(*outf));
}