 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/bonded.h"
#include "gromacs/listed_forces/disre.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/listed_forces/manage_threading.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
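
// Usage note: the getenv() check above only tests for the variable's
// presence, so any value works, e.g. (shell syntax):
//   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun ...
// which forces the static wait order in which PME results are always reduced first.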
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                const t_inputrec         *ir,
    double dt, elapsed_seconds, time_per_step;

    fputs(gmx::int64ToString(step).c_str(), out);

    if ((step >= ir->nstlist))
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

            finish = static_cast<time_t>(seconds_since_epoch + dt);
            auto   timebuf = gmx_ctime_r(&finish);
            timebuf.erase(timebuf.find_first_of('\n'));
            fputs(", will finish ", out);
            fputs(timebuf.c_str(), out);
        fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
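        /* ns/day: delta_t is the step size in ps, so delta_t/1000 is ns
         * advanced per step, and 24*60*60/time_per_step is steps per day
         * of wall time. */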
        fprintf(out, " performance: %.1f ns/day    ",
                ir->delta_t/1000*24*60*60/time_per_step);
    GMX_UNUSED_VALUE(cr);
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
    time_t temp_time = static_cast<time_t>(the_time);
    auto   timebuf   = gmx_ctime_r(&temp_time);
    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());

void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
    const int      end = forceToAdd.size();

    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
        rvec_inc(f[i], forceToAdd[i]);
static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
                        tensor vir_part, const t_graph *graph, const matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
static void pull_potential_wrapper(const t_commrec *cr,
                                   const t_inputrec *ir,
                                   const matrix box, gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial *force,
                                   const t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   gmx_wallcycle_t wcycle)
    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
static void pme_receive_force_ener(const t_commrec      *cr,
                                   gmx::ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t       *enerd,
                                   gmx_wallcycle_t       wcycle)
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);

    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
static void print_large_forces(FILE *fp,
    real       force2Tolerance = gmx::square(forceTolerance);
    gmx::index numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
    if (numNonFinite > 0)
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
static void post_process_forces(const t_commrec      *cr,
                                gmx_wallcycle_t       wcycle,
                                const gmx_localtop_t *top,
                                gmx::ForceWithVirial *forceWithVirial,
                                const t_mdatoms      *mdatoms,
                                const t_graph        *graph,
                                const t_forcerec     *fr,
                                const gmx_vsite_t    *vsite,
    if (fr->haveDirectVirialContributions)
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL) != 0, virial,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);

        if (flags & GMX_FORCE_VIRIAL)
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

        pr_rvecs(debug, 0, "vir_force", vir_force, DIM);

    if (fr->print_force >= 0)
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
static void do_nb_verlet(t_forcerec                       *fr,
                         const interaction_const_t        *ic,
                         gmx_enerdata_t                   *enerd,
                         const Nbnxm::InteractionLocality  ilocality,
                         gmx_wallcycle_t                   wcycle)
    if (!(flags & GMX_FORCE_NONBONDED))
        /* skip non-bonded calculation */

    nonbonded_verlet_t *nbv = fr->nbv;

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
        gmx_incons("Invalid cut-off scheme passed!");

        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            nbnxnIsDynamicPairlistPruningStep(*nbv, ilocality, step))
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbv->dispatchPruneKernel(ilocality, fr->shift_vec);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);

        wallcycle_sub_start(wcycle, ewcsNONBONDED);

    NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);

        wallcycle_sub_stop(wcycle, ewcsNONBONDED);

gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
    return nbv != nullptr && nbv->useGpu();
static inline void clear_rvecs_omp(int n, rvec v[])
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
        for (int i = 0; i < n; i++)

#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
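    /* Equipartition: each degree of freedom carries an average kinetic energy
     * of kB*T/2, so a T-coupled group contributes 0.5*nrdf*kB*ref_t below. The
     * return value then extrapolates this coupled-group estimate to all
     * degrees of freedom by scaling with (nrdfCoupled + nrdfUncoupled)/nrdfCoupled. */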
    for (int g = 0; g < groupOptions.ngtc; g++)
        if (groupOptions.tau_t[g] >= 0)
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
            nrdfUncoupled += groupOptions.nrdf[g];

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t               step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;
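    /* For scale (an illustrative estimate, not computed here): at 300 K each
     * degree of freedom carries ~0.5*kB*T ~ 1.25 kJ/mol, so a system with 1e5
     * coupled degrees of freedom has an average kinetic energy of order
     * 1e5 kJ/mol and the threshold sits near 1e11 kJ/mol; sane potential
     * energies are many orders of magnitude below that. */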
    bool energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow catching large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
        gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     awh              The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation Enforced rotation module.
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
computeSpecialForces(FILE                          *fplog,
                     const t_inputrec              *inputrec,
                     gmx_enfrot                    *enforcedRotation,
                     gmx_wallcycle_t                wcycle,
                     ForceProviders                *forceProviders,
                     gmx::ArrayRef<const gmx::RVec> x,
                     const t_mdatoms               *mdatoms,
                     gmx::ForceWithVirial          *forceWithVirial,
                     gmx_enerdata_t                *enerd,
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
        pull_potential_wrapper(cr, inputrec, box, x,
                               mdatoms, enerd, lambda, t,

        enerd->term[F_COM_PULL] +=
            awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                              t, step, wcycle, fplog);

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);

        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata       The PME structure
 * \param[in]  box           The box matrix
 * \param[in]  x             Coordinate array
 * \param[in]  flags         Force flags
 * \param[in]  pmeFlags      PME flags
 * \param[in]  wcycle        The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      gmx_wallcycle_t wcycle)
    pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);

/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata        The PME structure
 * \param[in]  wcycle         The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t      *pmedata,
                                     gmx_wallcycle_t wcycle)
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in,out] pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     pmeFlags         PME flags
 * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
                                        gmx::ArrayRefWithPadding<gmx::RVec> *force,
                                        gmx::ForceWithVirial                *forceWithVirial,
                                        gmx_enerdata_t                      *enerd,
                                        gmx_wallcycle_t                      wcycle)
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
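    /* Poll both tasks in turn. While neither is done, each is queried with
     * GpuTaskCompletion::Check (non-blocking); once one task has completed,
     * the other is queried with GpuTaskCompletion::Wait, since there is no
     * more reduction work left to overlap and spinning would only waste cycles. */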
    while (!isPmeGpuDone || !isNbGpuDone)
            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);

            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                     Nbnxm::AtomLocality::Local,
                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                     fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                               nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
/*! \brief
 *  Launch the dynamic rolling pruning GPU task.
 *
 *  We currently alternate local/non-local list pruning in odd-even steps
 *  (only pruning every second step without DD).
 *
 * \param[in]     cr               The communication record
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     inputrec         The input record
 * \param[in]     step             The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int numRollingParts = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
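    /* Example (illustrative numbers): with nstlistPrune = 10 the list is
     * rolling-pruned in numRollingParts = 5 chunks; without DD these run on
     * the even steps of the list's lifetime, while with DD even steps prune
     * the local list and odd steps the non-local one. */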
    int  stepWithCurrentList = nbnxnNumStepsWithPairlist(*nbv, Nbnxm::InteractionLocality::Local, step);
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || havePPDomainDecomposition(cr)))
        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
static void do_force_cutsVERLET(FILE *fplog,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx_enfrot *enforcedRotation,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                                gmx::ArrayRefWithPadding<gmx::RVec> force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                gmx::PpForceWorkload *ppForceWorkload,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    bUseGPU       = fr->nbv->useGpu();
    bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
    const int  pmeFlags   = GMX_PME_SPREAD | GMX_PME_SOLVE |
        ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
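    /* For example, a plain force-only MD step thus uses
     * GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F, while energy or
     * virial steps additionally set GMX_PME_CALC_ENER_VIR. */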
    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
        ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);

    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
        cg1 = cr->dd->globalAtomGroupIndices.size();

    update_forcerec(fr, box);

    if (inputrecNeedMutot(inputrec))
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
    if (fr->ePBC != epbcNONE)
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
            calc_shifts(box, fr->shift_vec);

            put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));

    nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
                                 fr->shift_vec, nbv->nbat);
    if (!thisRankHasDuty(cr, DUTY_PME))
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,

        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
    /* do gridding for pair search */
        if (graph && bStateChanged)
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));

        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!DOMAINDECOMP(cr))
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv, box,
                              nullptr, 0, mdatoms->homenr, -1,
                              fr->cginfo, x.unpaddedArrayRef(),
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);

            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd),
                                       fr->cginfo, x.unpaddedArrayRef());
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);

        wallcycle_stop(wcycle, ewcNS);
    /* initialize the GPU atom data and copy shift vector */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);

        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS && fr->gpuBonded)
            /* Now we put all atoms on the grid, we can assign bonded
             * interactions to the GPU, where the grid order is
             * needed. Also the xq, f and fshift device buffers have
             * been reallocated if needed, so the bonded code can
             * learn about them. */
            // TODO the xq, f, and fshift buffers are now shared
            // resources, so they should be maintained by a
            // higher-level object than the nb module.
            fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
            ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();

        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    /* do local pair search */
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
        nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
                               &top->excls, step, nrnb);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
        wallcycle_stop(wcycle, ewcNS);

        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),

        ddBalanceRegionHandler.openBeforeForceComputationGpu();

        wallcycle_start(wcycle, ewcLAUNCH_GPU);

        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        // bonded work not split into separate local and non-local, so with DD
        // we can only launch the kernel after non-local coordinates have been received.
        if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchKernels(fr, flags, box);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);

        /* launch local nonbonded work on GPU */
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);

        // In PME GPU and mixed mode we launch FFT / gather after the
        // X copy/transform to allow overlap as well as after the GPU NB
        // launch to avoid FFT launch overhead hijacking the CPU and delaying
        // the nonbonded kernel.
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (havePPDomainDecomposition(cr))
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
            /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
            nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
                                   &top->excls, step, nrnb);
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
            wallcycle_stop(wcycle, ewcNS);

            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),

            wallcycle_start(wcycle, ewcLAUNCH_GPU);

            /* launch non-local nonbonded tasks on GPU */
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            if (ppForceWorkload->haveGpuBondedWork)
                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
                fr->gpuBonded->launchKernels(fr, flags, box);
                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);

            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            wallcycle_stop(wcycle, ewcLAUNCH_GPU);

        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (havePPDomainDecomposition(cr))
            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchEnergyTransfer();
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    if (bStateChanged && inputrecNeedMutot(inputrec))
            gmx_sumd(2*DIM, mu, cr);

            ddBalanceRegionHandler.reopenRegionCpu();

        for (i = 0; i < 2; i++)
            for (j = 0; j < DIM; j++)
                fr->mu_tot[i][j] = mu[i*DIM + j];

        if (fr->efep == efepNO)
            copy_rvec(fr->mu_tot[0], mu_tot);
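        /* With free-energy perturbation the total dipole is interpolated
         * linearly in the coulomb lambda between the two end states:
         * mu_tot = (1 - lambda_coul)*mu_A + lambda_coul*mu_B. */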
            for (j = 0; j < DIM; j++)
                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                    lambda[efptCOUL]*fr->mu_tot[1][j];

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);
    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);

        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();

        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* TODO: update comment
             * We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));

        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
        clear_pull_forces(inputrec->pull_work);

    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                     step, nrnb, wcycle);

    if (fr->efep != efepNO)
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        wallcycle_sub_start(wcycle, ewcsNONBONDED);
        nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
                                      fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
                                      inputrec->fepvals, lambda,
                                      enerd, flags, nrnb);

        if (havePPDomainDecomposition(cr))
            nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
                                          fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
                                          inputrec->fepvals, lambda,
                                          enerd, flags, nrnb);
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);

        if (havePPDomainDecomposition(cr))
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                         step, nrnb, wcycle);
        const Nbnxm::InteractionLocality iloc =
            (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);

        /* Add all the non-bonded force to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);

        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);

        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->pairlistSet(iloc).nnbl > 1)
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,

    /* update QMMMrec, if necessary */
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      &cycles_pme, ddBalanceRegionHandler);

    wallcycle_stop(wcycle, ewcFORCE);

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
    /* wait for non-local forces (or calculate in emulation mode) */
    if (havePPDomainDecomposition(cr))
            wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
            Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                        flags, Nbnxm::AtomLocality::NonLocal,
                                        ppForceWorkload->haveGpuBondedWork,
                                        enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);

            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);

        /* skip the reduction if there was no non-local work to do */
        if (!nbv->pairlistSet(Nbnxm::InteractionLocality::NonLocal).nblGpu[0]->sci.empty())
            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                           nbv->nbat, f, wcycle);

    if (havePPDomainDecomposition(cr))
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        ddBalanceRegionHandler.closeAfterForceComputationCpu();

        dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
    if (!alternateGpuWait && useGpuPme)
        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);

    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && bUseGPU)
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         */
        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

        if (ddBalanceRegionHandler.useBalancingRegion())
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);

    if (fr->nbv->emulateGpu())
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);

        pme_gpu_reinit_computation(fr->pmedata, wcycle);
        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);

        /* Is dynamic pair-list pruning activated? */
        if (nbv->listParams->useDynamicPruning)
            launchGpuRollingPruning(cr, nbv, inputrec, step);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
        wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
        // in principle this should be included in the DD balancing region,
        // but generally it is infrequent so we'll omit it for the sake of
        // simplicity
        fr->gpuBonded->accumulateEnergyTerms(enerd);
        wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
        fr->gpuBonded->clearEnergies();
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (bUseOrEmulGPU && !alternateGpuWait)
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                       nbv->nbat, f, wcycle);

    if (DOMAINDECOMP(cr))
        dd_force_flop_stop(cr->dd, nrnb);

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);

        if (flags & GMX_FORCE_VIRIAL)
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);

        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,

    if (flags & GMX_FORCE_ENERGY)
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
static void do_force_cutsGROUP(FILE *fplog,
                               const t_commrec *cr,
                               const gmx_multisim_t *ms,
                               const t_inputrec *inputrec,
                               gmx_enfrot *enforcedRotation,
                               gmx_wallcycle_t wcycle,
                               gmx_localtop_t *top,
                               const gmx_groups_t *groups,
                               matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                               gmx::ArrayRefWithPadding<gmx::RVec> force,
                               const t_mdatoms *mdatoms,
                               gmx_enerdata_t *enerd,
                               const gmx_vsite_t *vsite,
                               const DDBalanceRegionHandler &ddBalanceRegionHandler)
    gmx_bool  bStateChanged, bNS, bFillGrid, bCalcCGCM;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
        cg1 = cr->dd->globalAtomGroupIndices.size();
    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    /* Should we perform the long-range nonbonded evaluation inside the neighbor searching? */
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);

    update_forcerec(fr, box);
    if (inputrecNeedMutot(inputrec))
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,

    if (fr->ePBC != epbcNONE)
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
            calc_shifts(box, fr->shift_vec);

            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
                                     &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
            inc_nrnb(nrnb, eNR_CGCM, homenr);
            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));

            calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
            inc_nrnb(nrnb, eNR_CGCM, homenr);

    if (bCalcCGCM && gmx_debug_at)
        pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
    if (!thisRankHasDuty(cr, DUTY_PME))
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
#endif /* GMX_MPI */

    /* Communicate coordinates and sum dipole if necessary */
    if (DOMAINDECOMP(cr))
        dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

        /* No GPU support, no move_x overlap, so reopen the balance region here */
        ddBalanceRegionHandler.reopenRegionCpu();

    if (inputrecNeedMutot(inputrec))
            gmx_sumd(2*DIM, mu, cr);

            ddBalanceRegionHandler.reopenRegionCpu();

        for (i = 0; i < 2; i++)
            for (j = 0; j < DIM; j++)
                fr->mu_tot[i][j] = mu[i*DIM + j];

        if (fr->efep == efepNO)
            copy_rvec(fr->mu_tot[0], mu_tot);

            for (j = 0; j < DIM; j++)
                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);
        wallcycle_start(wcycle, ewcNS);

        if (graph && bStateChanged)
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));

        /* Do the actual neighbour searching */
                 groups, top, mdatoms,
                 cr, nrnb, bFillGrid);

        wallcycle_stop(wcycle, ewcNS);

    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);

        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();

        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * separately, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
            forceRef = *fr->forceBufferForDirectVirialContributions;

        clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));

        /* Clear the short- and long-range forces */
        clear_rvecs(fr->natoms_force_constr, f);
    /* forceWithVirial might need the full force atom range */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
        clear_pull_forces(inputrec->pull_work);

    /* update QMMMrec, if necessary */
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda,
                      graph, &(top->excls), fr->mu_tot,
                      &cycles_pme, ddBalanceRegionHandler);

    wallcycle_stop(wcycle, ewcFORCE);

    if (DOMAINDECOMP(cr))
        dd_force_flop_stop(cr->dd, nrnb);

        ddBalanceRegionHandler.closeAfterForceComputationCpu();
    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,

        /* Communicate the forces */
        if (DOMAINDECOMP(cr))
            dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
            /* Do we need to communicate the separate force array
             * for terms that do not contribute to the single sum virial?
             * Position restraints and electric fields do not introduce
             * inter-cg forces, only full electrostatics methods do.
             * When we do not calculate the virial, fr->f_novirsum = f,
             * so we have already communicated these forces.
             */
            if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
                (flags & GMX_FORCE_VIRIAL))
                dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);

        if (flags & GMX_FORCE_VIRIAL)
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);

        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,

    if (flags & GMX_FORCE_ENERGY)
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
void do_force(FILE                                *fplog,
              const t_commrec                     *cr,
              const gmx_multisim_t                *ms,
              const t_inputrec                    *inputrec,
              gmx_enfrot                          *enforcedRotation,
              gmx_wallcycle_t                      wcycle,
              gmx_localtop_t                      *top,
              const gmx_groups_t                  *groups,
              gmx::ArrayRefWithPadding<gmx::RVec>  x,     //NOLINT(performance-unnecessary-value-param)
              gmx::ArrayRefWithPadding<gmx::RVec>  force, //NOLINT(performance-unnecessary-value-param)
              const t_mdatoms                     *mdatoms,
              gmx_enerdata_t                      *enerd,
              gmx::ArrayRef<real>                  lambda,
              gmx::PpForceWorkload                *ppForceWorkload,
              const gmx_vsite_t                   *vsite,
              const DDBalanceRegionHandler        &ddBalanceRegionHandler)
    /* modify force flag if not doing nonbonded */
    if (!fr->bNonbonded)
        flags &= ~GMX_FORCE_NONBONDED;

    switch (inputrec->cutoff_scheme)
            do_force_cutsVERLET(fplog, cr, ms, inputrec,
                                awh, enforcedRotation, step, nrnb, wcycle,
                                lambda.data(), graph,
                                ddBalanceRegionHandler);
            do_force_cutsGROUP(fplog, cr, ms, inputrec,
                               awh, enforcedRotation, step, nrnb, wcycle,
                               lambda.data(), graph,
                               ddBalanceRegionHandler);
            gmx_incons("Invalid cut-off scheme passed!");
    /* In case we don't have constraints and are using GPUs, the next balancing
     * region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
     * virial calculation and COM pulling, is thus not included in
     * the balance timing, which is ok as most tasks do communication.
     */
    ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
                        const t_inputrec *ir, const t_mdatoms *md,
    int     i, m, start, end;
    real    dt = ir->delta_t;

    /* We need to allocate one element extra, since we might use
     * (unaligned) 4-wide SIMD loads to access rvec entries.
     */
    snew(savex, state->natoms + 1);

        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
                start, md->homenr, end);

    /* Do a first constrain to reset particles... */
    step = ir->init_step;
        char buf[STEPSTRSIZE];
        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
                gmx_step_str(step, buf));

    /* constrain the current position */
    constr->apply(TRUE, FALSE,
                  state->x.rvec_array(), state->x.rvec_array(), nullptr,
                  state->lambda[efptBONDED], &dvdl_dum,
                  nullptr, nullptr, gmx::ConstraintVariable::Positions);
    /* constrain the initial velocity, and save it */
    /* also may be useful if we need the ekin from the halfstep for velocity verlet */
    constr->apply(TRUE, FALSE,
                  state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
                  state->lambda[efptBONDED], &dvdl_dum,
                  nullptr, nullptr, gmx::ConstraintVariable::Velocities);

    /* constrain the initial velocities at t-dt/2 */
1990 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1992 auto x = makeArrayRef(state->x).subArray(start, end);
1993 auto v = makeArrayRef(state->v).subArray(start, end);
1994 for (i = start; (i < end); i++)
1996 for (m = 0; (m < DIM); m++)
1998 /* Reverse the velocity */
2000 /* Store the position at t-dt in buf */
2001 savex[i][m] = x[i][m] + dt*v[i][m];
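/* Editor's note: because v was reversed just above, this stores
 * x(0) - dt*v_original, i.e. the unconstrained position at t = -dt,
 * which is then constrained against x(0) further below.
 */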
2004 /* Shake the positions at t=-dt with the positions at t=0
2005 * as reference coordinates.
2009 char buf[STEPSTRSIZE];
2010 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2011 gmx_step_str(step, buf));
2014 constr->apply(TRUE, FALSE,
2016 state->x.rvec_array(), savex, nullptr,
2018 state->lambda[efptBONDED], &dvdl_dum,
2019 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2021 for (i = start; i < end; i++)
2023 for (m = 0; m < DIM; m++)
2025 /* Re-reverse the velocities */
2035 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2036 double *enerout, double *virout)
2038 double enersum, virsum;
2039 double invscale, invscale2, invscale3;
2040 double r, ea, eb, ec, pa, pb, pc, pd;
2045 invscale = 1.0/scale;
2046 invscale2 = invscale*invscale;
2047 invscale3 = invscale*invscale2;
2049 /* Following summation derived from cubic spline definition,
2050 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2051 * the cubic spline. We first calculate the negative of the
2052 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2053 * add the more standard, abrupt cutoff correction to that result,
2054 * yielding the long-range correction for a switched function. We
2055 * perform both the pressure and energy loops at the same time for
2056 * simplicity, as the computational cost is low. */
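/* Editor's note (derivation sketch): on table interval ri the spline is
 * y(t) = y0 + f*t + g*t^2 + h*t^3 with t in [0,1] and r = (ri + t)/scale,
 * so r^2 dr = (ea*t^2 + eb*t + ec) dt with ea = invscale^3,
 * eb = 2*invscale^2*r and ec = invscale*r^2 (r evaluated at t = 0).
 * Using Int_0^1 t^n*(ea*t^2 + eb*t + ec) dt = ea/(n+3) + eb/(n+2) + ec/(n+1)
 * reproduces the per-coefficient denominators in the enersum line in the
 * loop below; the common 4*pi factor is applied after the loop.
 */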
2060 /* Since the dispersion table has been scaled down a factor
2061 * 6.0 and the repulsion a factor 12.0 to compensate for the
2062 * c6/c12 parameters inside nbfp[] being scaled up (to save
2063 * flops in kernels), we need to correct for this.
2074 for (ri = rstart; ri < rend; ++ri)
2078 eb = 2.0*invscale2*r;
2082 pb = 3.0*invscale2*r;
2083 pc = 3.0*invscale*r*r;
2086 /* this "8" is from the packing in the vdwtab array - perhaps
2087 should be defined? */
2089 offset = 8*ri + offstart;
2090 y0 = vdwtab[offset];
2091 f = vdwtab[offset+1];
2092 g = vdwtab[offset+2];
2093 h = vdwtab[offset+3];
2095 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2096 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2098 *enerout = 4.0*M_PI*enersum*tabfactor;
2099 *virout = 4.0*M_PI*virsum*tabfactor;
2102 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2104 double eners[2], virs[2], enersum, virsum;
2105 double r0, rc3, rc9;
2107 real scale, *vdwtab;
2109 fr->enershiftsix = 0;
2110 fr->enershifttwelve = 0;
2111 fr->enerdiffsix = 0;
2112 fr->enerdifftwelve = 0;
2114 fr->virdifftwelve = 0;
2116 const interaction_const_t *ic = fr->ic;
2118 if (eDispCorr != edispcNO)
2120 for (i = 0; i < 2; i++)
2125 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2126 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2127 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2128 (ic->vdwtype == evdwSHIFT) ||
2129 (ic->vdwtype == evdwSWITCH))
2131 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2132 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2133 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2136 "With dispersion correction rvdw-switch can not be zero "
2137 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2140 /* TODO This code depends on the logic in tables.c that
2141 constructs the table layout, which should be made
2142 explicit in future cleanup. */
2143 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2144 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2145 scale = fr->dispersionCorrectionTable->scale;
2146 vdwtab = fr->dispersionCorrectionTable->data;
2148 /* Round the cut-offs to exact table values for precision */
2149 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2150 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2152 /* The code below has some support for handling force-switching, i.e.
2153 * when the force (instead of potential) is switched over a limited
2154 * region. This leads to a constant shift in the potential inside the
2155 * switching region, which we can handle by adding a constant energy
2156 * term in the force-switch case just like when we do potential-shift.
2158 * For now this is not enabled, but to keep the functionality in the
2159 * code we check separately for switch and shift. When we do force-switch
2160 * the shifting point is rvdw_switch, while it is the cutoff when we
2161 * have a classical potential-shift.
2163 * For a pure potential-shift the potential has a constant shift
2164 * all the way out to the cutoff, and that is it. For other forms
2165 * we need to calculate the constant shift up to the point where we
2166 * start modifying the potential.
2168 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2174 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2175 (ic->vdwtype == evdwSHIFT))
2177 /* Determine the constant energy shift below rvdw_switch.
2178 * Table has a scale factor since we have scaled it down to compensate
2179 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2181 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2182 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2184 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2186 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2187 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2190 /* Add the constant part from 0 to rvdw_switch.
2191 * This integration from 0 to rvdw_switch overcounts the number
2192 * of interactions by 1, as it also counts the self-interaction.
2193 * We will correct for this later.
2195 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2196 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
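/* Editor's note: the two terms above are the closed-form integral of a
 * constant shift over the sphere r < rc:
 *   Int_0^rc 4*pi*r^2 * V_shift dr = (4*pi/3)*rc^3*V_shift.
 */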
2198 /* Calculate the contribution in the range [r0,r1] where we
2199 * modify the potential. For a pure potential-shift modifier we will
2200 * have ri0==ri1, and there will not be any contribution here.
2202 for (i = 0; i < 2; i++)
2206 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2207 eners[i] -= enersum;
2211 /* Alright: Above we compensated by REMOVING the parts outside r0
2212 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2214 * Regardless of whether r0 is the point where we start switching,
2215 * or the cutoff where we calculated the constant shift, we include
2216 * all the parts we are missing out to infinity from r0 by
2217 * calculating the analytical dispersion correction.
2219 eners[0] += -4.0*M_PI/(3.0*rc3);
2220 eners[1] += 4.0*M_PI/(9.0*rc9);
2221 virs[0] += 8.0*M_PI/rc3;
2222 virs[1] += -16.0*M_PI/(3.0*rc9);
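/* Editor's note (derivation sketch): these are the standard analytical
 * tail integrals beyond r0,
 *   Int_r0^inf 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3),
 *   Int_r0^inf 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9),
 * and for the virial Int_r0^inf 4*pi*r^3*dV/dr dr, giving 8*pi/r0^3 and
 * -16*pi/(3*r0^9); the factor 0.5 from the GROMACS virial definition is
 * applied where virdiffsix/virdifftwelve are set at the end.
 */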
2224 else if (ic->vdwtype == evdwCUT ||
2225 EVDW_PME(ic->vdwtype) ||
2226 ic->vdwtype == evdwUSER)
2228 if (ic->vdwtype == evdwUSER && fplog)
2231 "WARNING: using dispersion correction with user tables\n");
2234 /* Note that with LJ-PME, the dispersion correction is multiplied
2235 * by the difference between the actual C6 and the value of C6
2236 * that would produce the combination rule.
2237 * This means the normal energy and virial difference formulas can be used here.
2241 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2243 /* Contribution beyond the cut-off */
2244 eners[0] += -4.0*M_PI/(3.0*rc3);
2245 eners[1] += 4.0*M_PI/(9.0*rc9);
2246 if (ic->vdw_modifier == eintmodPOTSHIFT)
2248 /* Contribution within the cut-off */
2249 eners[0] += -4.0*M_PI/(3.0*rc3);
2250 eners[1] += 4.0*M_PI/(3.0*rc9);
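/* Editor's note: with potential-shift every pair inside the cut-off was
 * shifted by -V(rc), so the correction adds that constant back:
 * (4*pi/3)*rc^3*V(rc), i.e. -4*pi/(3*rc^3) for the 1/r^6 term and
 * +4*pi/(3*rc^9) for the 1/r^12 term, as added above.
 */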
2252 /* Contribution beyond the cut-off */
2253 virs[0] += 8.0*M_PI/rc3;
2254 virs[1] += -16.0*M_PI/(3.0*rc9);
2259 "Dispersion correction is not implemented for vdw-type = %s",
2260 evdw_names[ic->vdwtype]);
2263 /* When we deprecate the group kernels the code below can go too */
2264 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2266 /* Calculate self-interaction coefficient (assuming that
2267 * the reciprocal-space contribution is constant in the
2268 * region that contributes to the self-interaction).
2270 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2272 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2273 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2276 fr->enerdiffsix = eners[0];
2277 fr->enerdifftwelve = eners[1];
2278 /* The 0.5 is due to the Gromacs definition of the virial */
2279 fr->virdiffsix = 0.5*virs[0];
2280 fr->virdifftwelve = 0.5*virs[1];
2284 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2285 const matrix box, real lambda, tensor pres, tensor virial,
2286 real *prescorr, real *enercorr, real *dvdlcorr)
2288 gmx_bool bCorrAll, bCorrPres;
2289 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2299 if (ir->eDispCorr != edispcNO)
2301 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2302 ir->eDispCorr == edispcAllEnerPres);
2303 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2304 ir->eDispCorr == edispcAllEnerPres);
2306 invvol = 1/det(box);
2309 /* Only correct for the interactions with the inserted molecule */
2310 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2315 dens = fr->numAtomsForDispersionCorrection*invvol;
2316 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2319 if (ir->efep == efepNO)
2321 avcsix = fr->avcsix[0];
2322 avctwelve = fr->avctwelve[0];
2326 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2327 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
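/* Editor's note: for free-energy runs the average C6/C12 coefficients of
 * the A (index 0) and B (index 1) states are mixed linearly in lambda;
 * the dvdl contributions below then use the corresponding B - A
 * differences.
 */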
2330 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2331 *enercorr += avcsix*enerdiff;
2333 if (ir->efep != efepNO)
2335 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2339 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2340 *enercorr += avctwelve*enerdiff;
2341 if (fr->efep != efepNO)
2343 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2349 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2350 if (ir->eDispCorr == edispcAllEnerPres)
2352 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2354 /* The factor 2 is because of the Gromacs virial definition */
2355 spres = -2.0*invvol*svir*PRESFAC;
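/* Editor's note: with the GROMACS convention P = (2/V)*(Ekin - Xi),
 * where Xi is the virial tensor, a correction svir on each diagonal
 * virial element maps to a pressure correction of -2*svir/V, with
 * PRESFAC converting to bar.
 */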
2357 for (m = 0; m < DIM; m++)
2359 virial[m][m] += svir;
2360 pres[m][m] += spres;
2365 /* Can't currently control when this prints; for now, just print when debugging */
2370 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2376 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2377 *enercorr, spres, svir);
2381 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2385 if (fr->efep != efepNO)
2387 *dvdlcorr += dvdlambda;
2392 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2393 const gmx_mtop_t *mtop, rvec x[],
2399 if (bFirst && fplog)
2401 fprintf(fplog, "Removing pbc first time\n");
2406 for (const gmx_molblock_t &molb : mtop->molblock)
2408 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2409 if (moltype.atoms.nr == 1 ||
2410 (!bFirst && moltype.cgs.nr == 1))
2412 /* Just one atom or charge group in the molecule, no PBC required */
2413 as += molb.nmol*moltype.atoms.nr;
2417 mk_graph_moltype(moltype, graph);
2419 for (mol = 0; mol < molb.nmol; mol++)
2421 mk_mshift(fplog, graph, ePBC, box, x+as);
2423 shift_self(graph, box, x+as);
2424 /* The molecule is whole now.
2425 * We don't need the second mk_mshift call as in do_pbc_first,
2426 * since we no longer need this graph.
2429 as += moltype.atoms.nr;
2437 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2438 const gmx_mtop_t *mtop, rvec x[])
2440 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2443 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2444 const gmx_mtop_t *mtop, rvec x[])
2446 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2449 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2452 nth = gmx_omp_nthreads_get(emntDefault);
2454 #pragma omp parallel for num_threads(nth) schedule(static)
2455 for (t = 0; t < nth; t++)
2459 size_t natoms = x.size();
2460 size_t offset = (natoms*t )/nth;
2461 size_t len = (natoms*(t + 1))/nth - offset;
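/* Editor's note: the floor((natoms*t)/nth) boundaries divide the atoms
 * into nth contiguous chunks whose sizes differ by at most one, so every
 * atom is processed exactly once with no remainder handling needed.
 */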
2462 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2464 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2468 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2469 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2470 const t_inputrec *inputrec,
2471 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2472 gmx_walltime_accounting_t walltime_accounting,
2473 nonbonded_verlet_t *nbv,
2474 const gmx_pme_t *pme,
2475 gmx_bool bWriteStat)
2477 t_nrnb *nrnb_tot = nullptr;
2479 double nbfs = 0, mflop = 0;
2480 double elapsed_time,
2481 elapsed_time_over_all_ranks,
2482 elapsed_time_over_all_threads,
2483 elapsed_time_over_all_threads_over_all_ranks;
2484 /* Control whether it is valid to print a report. Only the
2485 simulation master may print, but it should not do so if the run
2486 terminated e.g. before a scheduled reset step. This is
2487 complicated by the fact that PME ranks are unaware of the
2488 reason why they were sent a pmerecvqxFINISH. To avoid
2489 communication deadlocks, we always do the communication for the
2490 report, even if we've decided not to write the report, because
2491 how long it takes to finish the run is not important when we've
2492 decided not to report on the simulation performance.
2494 Further, we only report performance for dynamical integrators,
2495 because those are the only ones for which we plan to
2496 consider doing any optimizations. */
2497 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2499 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2501 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2502 printReport = false;
2509 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2510 cr->mpi_comm_mysim);
2518 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2519 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2523 /* reduce elapsed_time over all MPI ranks in the current simulation */
2524 MPI_Allreduce(&elapsed_time,
2525 &elapsed_time_over_all_ranks,
2526 1, MPI_DOUBLE, MPI_SUM,
2527 cr->mpi_comm_mysim);
2528 elapsed_time_over_all_ranks /= cr->nnodes;
2529 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2530 * current simulation. */
2531 MPI_Allreduce(&elapsed_time_over_all_threads,
2532 &elapsed_time_over_all_threads_over_all_ranks,
2533 1, MPI_DOUBLE, MPI_SUM,
2534 cr->mpi_comm_mysim);
2539 elapsed_time_over_all_ranks = elapsed_time;
2540 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2545 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2552 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2554 print_dd_statistics(cr, inputrec, fplog);
2557 /* TODO Move the responsibility for any scaling by thread counts
2558 * to the code that handled the thread region, so that there's a
2559 * mechanism to keep cycle counting working during the transition
2560 * to task parallelism. */
2561 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2562 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2563 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2564 auto cycle_sum(wallcycle_sum(cr, wcycle));
2568 auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2569 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2570 if (pme_gpu_task_enabled(pme))
2572 pme_gpu_get_timings(pme, &pme_gpu_timings);
2574 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2575 elapsed_time_over_all_ranks,
2580 if (EI_DYNAMICS(inputrec->eI))
2582 delta_t = inputrec->delta_t;
2587 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2588 elapsed_time_over_all_ranks,
2589 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2590 delta_t, nbfs, mflop);
2594 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2595 elapsed_time_over_all_ranks,
2596 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2597 delta_t, nbfs, mflop);
2602 void initialize_lambdas(FILE *fplog,
2603 const t_inputrec &ir,
2606 gmx::ArrayRef<real> lambda,
2609 /* TODO: Clean up initialization of fep_state and lambda in
2610 t_state. This function works, but could probably use a logic
2611 rewrite to keep all the different types of efep straight. */
2613 if ((ir.efep == efepNO) && (!ir.bSimTemp))
2618 const t_lambda *fep = ir.fepvals;
2621 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2622 if a checkpoint is set -- a kludge is in place for now
2626 for (int i = 0; i < efptNR; i++)
2629 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2630 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2632 thisLambda = fep->init_lambda;
2636 thisLambda = fep->all_lambda[i][fep->init_fep_state];
2640 lambda[i] = thisLambda;
2642 if (lam0 != nullptr)
2644 lam0[i] = thisLambda;
2649 /* need to rescale control temperatures to match current state */
2650 for (int i = 0; i < ir.opts.ngtc; i++)
2652 if (ir.opts.ref_t[i] > 0)
2654 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2659 /* Report the current lambda values to the log */
2660 if (fplog != nullptr)
2662 fprintf(fplog, "Initial vector of lambda components: [ ");
2663 for (const auto &l : lambda)
2665 fprintf(fplog, "%10.4f ", l);
2667 fprintf(fplog, "]\n");