 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/bonded.h"
#include "gromacs/listed_forces/disre.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/listed_forces/manage_threading.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
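// Note: setting GMX_DISABLE_ALTERNATING_GPU_WAIT in the environment thus
// forces the static wait order (PME first), bypassing the polling-based
// alternating wait implemented in alternatePmeNbGpuWaitReduce() below.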
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                int64_t                   step,
                const t_inputrec         *ir,
                const t_commrec          *cr)
{
    double dt, elapsed_seconds, time_per_step;

    fputs(gmx::int64ToString(step).c_str(), out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                time_t finish  = static_cast<time_t>(seconds_since_epoch + dt);
                auto   timebuf = gmx_ctime_r(&finish);
                timebuf.erase(timebuf.find_first_of('\n'));
                fputs(", will finish ", out);
                fputs(timebuf.c_str(), out);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
            }
        }
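        // When the total number of steps is unknown (nsteps < 0), report the
        // current rate instead: delta_t is in ps, so delta_t/1000 is ns per
        // step, and multiplying by 86400 s/day and dividing by the measured
        // seconds per step yields ns/day.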
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }

    GMX_UNUSED_VALUE(cr);
}
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    time_t temp_time = static_cast<time_t>(the_time);

    auto   timebuf = gmx_ctime_r(&temp_time);

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
}
void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
    const int      end = forceToAdd.size();

    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
                        tensor vir_part, const t_graph *graph, const matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
static void pull_potential_wrapper(const t_commrec *cr,
                                   const t_inputrec *ir,
                                   const matrix box, gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial *force,
                                   const t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   const real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc pbc;
    real  dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(const t_commrec *cr,
                                   gmx::ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t *enerd,
                                   gmx_wallcycle_t wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);

    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE            *fp,
                               const t_mdatoms *md,
                               const t_commrec *cr,
                               int64_t          step,
                               real             forceTolerance,
                               const rvec      *x,
                               const rvec      *f)
{
    real       force2Tolerance = gmx::square(forceTolerance);
    gmx::index numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
    }
}
static void post_process_forces(const t_commrec *cr,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const matrix box,
                                const rvec x[],
                                rvec f[],
                                gmx::ForceWithVirial *forceWithVirial,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                const t_graph *graph,
                                const t_forcerec *fr,
                                const gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->haveDirectVirialContributions)
    {
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallellized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL) != 0, virial,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(t_forcerec                       *fr,
                         const interaction_const_t        *ic,
                         gmx_enerdata_t                   *enerd,
                         const int                         flags,
                         const Nbnxm::InteractionLocality  ilocality,
                         const int                         clearF,
                         const int64_t                     step,
                         t_nrnb                           *nrnb,
                         gmx_wallcycle_t                   wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t *nbv = fr->nbv;

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    /* When dynamic pair-list pruning is requested, we need to prune
     * at nstlistPrune steps.
     */
    if (nbv->listParams->useDynamicPruning &&
        nbnxnIsDynamicPairlistPruningStep(*nbv, ilocality, step))
    {
        /* Prune the pair-list beyond fr->ic->rlistPrune using
         * the current coordinates of the atoms.
         */
        wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
        NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
        wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
    }

    wallcycle_sub_start(wcycle, ewcsNONBONDED);

    NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             const t_mdatoms      *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i] = 0;
    }

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energies contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
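            /* lam_i now holds the full lambda vector for this pass: entry
             * i == 0 re-evaluates at the current lambda state, while i > 0
             * takes the foreign values from fepvals->all_lambda[j][i-1]. */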
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->useGpu();
}
static inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
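/* Worked example of the estimate above: one coupled group with
 * nrdf = 1000 and ref_t = 300 K contributes
 *   0.5 * 1000 * kB * 300 K ≈ 0.5 * 1000 * 0.008314 kJ/(mol K) * 300 K ≈ 1247 kJ/mol,
 * which is then scaled by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled to
 * account for the uncoupled degrees of freedom. */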
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t               step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;
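    // Example scale: ~1e5 coupled degrees of freedom at 300 K give an average
    // kinetic energy estimate of roughly 1.25e5 kJ/mol, so the fatal check
    // below only triggers near ~1.25e11 kJ/mol, far beyond any sane value.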
    bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real           averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
    {
        gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  step,
                  enerd.term[F_EPOT],
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ],
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
    }
}
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     awh              The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation Enforced rotation module.
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE                          *fplog,
                     const t_commrec               *cr,
                     const t_inputrec              *inputrec,
                     gmx::Awh                      *awh,
                     gmx_enfrot                    *enforcedRotation,
                     int64_t                        step,
                     double                         t,
                     gmx_wallcycle_t                wcycle,
                     ForceProviders                *forceProviders,
                     matrix                         box,
                     gmx::ArrayRef<const gmx::RVec> x,
                     const t_mdatoms               *mdatoms,
                     real                          *lambda,
                     int                            forceFlags,
                     gmx::ForceWithVirial          *forceWithVirial,
                     gmx_enerdata_t                *enerd,
                     gmx_edsam                     *ed,
                     gmx_bool                       bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               forceWithVirial,
                               mdatoms, enerd, lambda, t,
                               wcycle);

        if (awh)
        {
            enerd->term[F_COM_PULL] +=
                awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                                  forceWithVirial,
                                                  t, step, wcycle, fplog);
        }
    }

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata       The PME structure
 * \param[in]  box           The box matrix
 * \param[in]  x             Coordinate array
 * \param[in]  flags         Force flags
 * \param[in]  pmeFlags      PME flags
 * \param[in]  wcycle        The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      matrix          box,
                                      rvec            x[],
                                      int             flags,
                                      int             pmeFlags,
                                      gmx_wallcycle_t wcycle)
{
    pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);
}

/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata        The PME structure
 * \param[in]  wcycle         The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
                                     gmx_wallcycle_t   wcycle)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in,out] pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     pmeFlags         PME flags
 * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
                                        gmx_pme_t                           *pmedata,
                                        gmx::ArrayRefWithPadding<gmx::RVec> *force,
                                        gmx::ForceWithVirial                *forceWithVirial,
                                        rvec                                 fshift[],
                                        gmx_enerdata_t                      *enerd,
                                        int                                  flags,
                                        int                                  pmeFlags,
                                        bool                                 haveOtherWork,
                                        gmx_wallcycle_t                      wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
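            // Check polls the task without blocking, while Wait blocks;
            // once one task is known to be done, we can safely block on
            // the other instead of continuing to spin.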
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                     flags,
                                                     Nbnxm::AtomLocality::Local,
                                                     haveOtherWork,
                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                     fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
            if (isNbGpuDone)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                               nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
            }
        }
    }
}
/*! \brief
 *  Launch the dynamic rolling pruning GPU task.
 *
 *  We currently alternate local/non-local list pruning in odd-even steps
 *  (only pruning every second step without DD).
 *
 * \param[in]     cr               The communication record
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     inputrec         The input record
 * \param[in]     step             The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
                                           const int64_t             step)
{
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = nbnxnNumStepsWithPairlist(*nbv, Nbnxm::InteractionLocality::Local, step);
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || havePPDomainDecomposition(cr)))
    {
        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
                                           numRollingParts);
    }
}
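/* Scheduling example for the pruning above: with DD, even steps within the
 * life of a pair list prune the local list and odd steps the non-local one;
 * without DD, only the even steps launch pruning, i.e. every second step. */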
static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx::Awh *awh,
                                gmx_enfrot *enforcedRotation,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                                history_t *hist,
                                gmx::ArrayRefWithPadding<gmx::RVec> force,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda,
                                t_graph *graph,
                                t_forcerec *fr,
                                gmx::PpForceWorkload *ppForceWorkload,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                rvec mu_tot,
                                double t,
                                gmx_edsam *ed,
                                int flags,
                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    bUseGPU       = fr->nbv->useGpu();
    bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
    const int  pmeFlags   = GMX_PME_SPREAD | GMX_PME_SOLVE |
        ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
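    // Note that both the energy and the virial force flags map to the same
    // PME flag: GMX_PME_CALC_ENER_VIR computes energies and virial together,
    // so requesting either one turns on the combined calculation.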
    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS)
    {
        ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
    }

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }

    update_forcerec(fr, box);

    if (inputrecNeedMutot(inputrec))
    {
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                mu, mu+DIM);
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }
    }

    nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
                                 fr->shift_vec, nbv->nbat);
#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */

    if (useGpuPme)
    {
        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
    }
    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!DOMAINDECOMP(cr))
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv, box,
                              0, vzero, box_diag,
                              nullptr, 0, mdatoms->homenr, -1,
                              fr->cginfo, x.unpaddedArrayRef(),
                              0, nullptr);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd),
                                       fr->cginfo, x.unpaddedArrayRef());
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);

        wallcycle_stop(wcycle, ewcNS);
    }
    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS)
        {
            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
        }

        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS && fr->gpuBonded)
        {
            /* Now we put all atoms on the grid, we can assign bonded
             * interactions to the GPU, where the grid order is
             * needed. Also the xq, f and fshift device buffers have
             * been reallocated if needed, so the bonded code can
             * learn about them. */
            // TODO the xq, f, and fshift buffers are now shared
            // resources, so they should be maintained by a
            // higher-level object than the nb module.
            fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                  top->idef,
                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
            ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
        }

        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
        nbnxn_make_pairlist(nbv, Nbnxm::InteractionLocality::Local,
                            &top->excls, step, nrnb);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                        nbv->nbat, wcycle);
    }
    if (bUseGPU)
    {
        ddBalanceRegionHandler.openBeforeForceComputationGpu();

        wallcycle_start(wcycle, ewcLAUNCH_GPU);

        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        // bonded work not split into separate local and non-local, so with DD
        // we can only launch the kernel after non-local coordinates have been received.
        if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
        {
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchKernels(fr, flags, box);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        }

        /* launch local nonbonded work on GPU */
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (useGpuPme)
    {
        // In PME GPU and mixed mode we launch FFT / gather after the
        // X copy/transform to allow overlap as well as after the GPU NB
        // launch to avoid FFT launch overhead hijacking the CPU and delaying
        // the nonbonded kernel.
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
    }
    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (havePPDomainDecomposition(cr))
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
            /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
            nbnxn_make_pairlist(nbv, Nbnxm::InteractionLocality::NonLocal,
                                &top->excls, step, nrnb);
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                            nbv->nbat, wcycle);
        }

        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU);

            /* launch non-local nonbonded tasks on GPU */
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            if (ppForceWorkload->haveGpuBondedWork)
            {
                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
                fr->gpuBonded->launchKernels(fr, flags, box);
                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
            }

            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
        }
    }
    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (havePPDomainDecomposition(cr))
        {
            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
        }
        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
        {
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchEnergyTransfer();
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        }
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);

            ddBalanceRegionHandler.reopenRegionCpu();
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }
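    /* i.e. with a coupled Coulomb lambda the system dipole is interpolated
     * linearly between the A-state (lambda = 0) and B-state (lambda = 1)
     * dipole moments. */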
    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* TODO: update comment
             * We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }

        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);
    }

    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }
    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->pairlistSets[Nbnxm::InteractionLocality::Local].nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->pairlistSets[Nbnxm::InteractionLocality::Local],
                             fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal].nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal],
                             fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU)
    {
        if (havePPDomainDecomposition(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                         step, nrnb, wcycle);
        }
1394 /* Add all the non-bonded force to the normal force array.
1395 * This can be split into a local and a non-local part when overlapping
1396 * communication with calculation with domain decomposition.
1398 wallcycle_stop(wcycle, ewcFORCE);
1400 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
1402 wallcycle_start_nocount(wcycle, ewcFORCE);
1404 /* if there are multiple fshift output buffers reduce them */
1405 if ((flags & GMX_FORCE_VIRIAL) &&
1406 nbv->pairlistSets[iloc].nnbl > 1)
1408 /* This is not in a subcounter because it takes a
1409 negligible and constant-sized amount of time */
1410 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1415 /* update QMMMrec, if necessary */
1418 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1421 /* Compute the bonded and non-bonded energies and optionally forces */
1422 do_force_lowlevel(fr, inputrec, &(top->idef),
1423 cr, ms, nrnb, wcycle, mdatoms,
1424 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1425 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1427 &cycles_pme, ddBalanceRegionHandler);
1429 wallcycle_stop(wcycle, ewcFORCE);
1431 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1433 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1434 flags, &forceWithVirial, enerd,
    /* wait for non-local forces (or calculate in emulation mode) */
    if (havePPDomainDecomposition(cr))
    {
        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
            Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                        flags, Nbnxm::AtomLocality::NonLocal,
                                        ppForceWorkload->haveGpuBondedWork,
                                        enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                        fr->fshift);
            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
        }
        else
        {
            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);
        }

        /* skip the reduction if there was no non-local work to do */
        if (!nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal].nblGpu[0]->sci.empty())
        {
            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                           nbv->nbat, f, wcycle);
        }
    }

    if (havePPDomainDecomposition(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        ddBalanceRegionHandler.closeAfterForceComputationCpu();

        if (bDoForces)
        {
            dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
        }
    }
    // With both nonbonded and PME offloaded a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
    }

    if (!alternateGpuWait && useGpuPme)
    {
        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
    }

    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && bUseGPU)
    {
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         * of the step time.
         */
        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                    fr->fshift);
        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

        if (ddBalanceRegionHandler.useBalancingRegion())
        {
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
            {
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            }
            ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
        }
    }
    if (fr->nbv->emulateGpu())
    {
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);
    }

    if (useGpuPme)
    {
        pme_gpu_reinit_computation(fr->pmedata, wcycle);
    }

    if (bUseGPU)
    {
        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);

        /* Is dynamic pair-list pruning activated? */
        if (nbv->listParams->useDynamicPruning)
        {
            launchGpuRollingPruning(cr, nbv, inputrec, step);
        }
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
    {
        wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
        // in principle this should be included in the DD balancing region,
        // but generally it is infrequent so we'll omit it for the sake of
        // simpler code
        fr->gpuBonded->accumulateEnergyTerms(enerd);
        wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
        fr->gpuBonded->clearEnergies();
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (bUseOrEmulGPU && !alternateGpuWait)
    {
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                       nbv->nbat, f, wcycle);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }

    if (bDoForces)
    {
        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
static void do_force_cutsGROUP(FILE *fplog,
                               const t_commrec *cr,
                               const gmx_multisim_t *ms,
                               const t_inputrec *inputrec,
                               gmx::Awh *awh,
                               gmx_enfrot *enforcedRotation,
                               int64_t step,
                               t_nrnb *nrnb,
                               gmx_wallcycle_t wcycle,
                               gmx_localtop_t *top,
                               const gmx_groups_t *groups,
                               matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                               history_t *hist,
                               gmx::ArrayRefWithPadding<gmx::RVec> force,
                               tensor vir_force,
                               const t_mdatoms *mdatoms,
                               gmx_enerdata_t *enerd,
                               t_fcdata *fcd,
                               real *lambda,
                               t_graph *graph,
                               t_forcerec *fr,
                               const gmx_vsite_t *vsite,
                               rvec mu_tot,
                               double t,
                               gmx_edsam *ed,
                               int flags,
                               const DDBalanceRegionHandler &ddBalanceRegionHandler)
{
    int      cg0, cg1, i, j;
    double   mu[2*DIM];
    gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool bDoForces;
    float    cycles_pme;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }
    else
    {
        cg1 = top->cgs.nr;
    }

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    update_forcerec(fr, box);

    if (inputrecNeedMutot(inputrec))
    {
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                mu, mu+DIM);
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
                                     &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
            inc_nrnb(nrnb, eNR_CGCM, homenr);
            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }
    }
    else if (bCalcCGCM)
    {
        calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
        inc_nrnb(nrnb, eNR_CGCM, homenr);
    }

    if (bCalcCGCM && gmx_debug_at)
    {
        pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
    }

#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */
    /* Communicate coordinates and sum dipole if necessary */
    if (DOMAINDECOMP(cr))
    {
        dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

        /* No GPU support, no move_x overlap, so reopen the balance region here */
        ddBalanceRegionHandler.reopenRegionCpu();
    }

    if (inputrecNeedMutot(inputrec))
    {
        if (bStateChanged)
        {
            if (PAR(cr))
            {
                gmx_sumd(2*DIM, mu, cr);

                ddBalanceRegionHandler.reopenRegionCpu();
            }
            for (i = 0; i < 2; i++)
            {
                for (j = 0; j < DIM; j++)
                {
                    fr->mu_tot[i][j] = mu[i*DIM + j];
                }
            }
        }
        if (fr->efep == efepNO)
        {
            copy_rvec(fr->mu_tot[0], mu_tot);
        }
        else
        {
            for (j = 0; j < DIM; j++)
            {
                mu_tot[j] =
                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
            }
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (bNS)
    {
        wallcycle_start(wcycle, ewcNS);

        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }

        /* Do the actual neighbour searching */
        ns(fplog, fr, box,
           groups, top, mdatoms,
           cr, nrnb, bFillGrid);

        wallcycle_stop(wcycle, ewcNS);
    }

    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }
    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * separately, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }

        /* Clear the short- and long-range forces */
        clear_rvecs(fr->natoms_force_constr, f);
    }

    /* forceWithVirial might need the full force atom range */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda,
                      graph, &(top->excls), fr->mu_tot,
                      flags,
                      &cycles_pme, ddBalanceRegionHandler);

    wallcycle_stop(wcycle, ewcFORCE);

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);

        ddBalanceRegionHandler.closeAfterForceComputationCpu();
    }

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         step, t, wcycle,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
                         ed, bNS);
    if (bDoForces)
    {
        /* Communicate the forces */
        if (DOMAINDECOMP(cr))
        {
            dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
            /* Do we need to communicate the separate force array
             * for terms that do not contribute to the single sum virial?
             * Position restraints and electric fields do not introduce
             * inter-cg forces, only full electrostatics methods do.
             * When we do not calculate the virial, fr->f_novirsum = f,
             * so we have already communicated these forces.
             */
            if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
                (flags & GMX_FORCE_VIRIAL))
            {
                dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
            }
        }

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
void do_force(FILE                                *fplog,
              const t_commrec                     *cr,
              const gmx_multisim_t                *ms,
              const t_inputrec                    *inputrec,
              gmx::Awh                            *awh,
              gmx_enfrot                          *enforcedRotation,
              int64_t                              step,
              t_nrnb                              *nrnb,
              gmx_wallcycle_t                      wcycle,
              gmx_localtop_t                      *top,
              const gmx_groups_t                  *groups,
              matrix                               box,
              gmx::ArrayRefWithPadding<gmx::RVec>  x,     //NOLINT(performance-unnecessary-value-param)
              history_t                           *hist,
              gmx::ArrayRefWithPadding<gmx::RVec>  force, //NOLINT(performance-unnecessary-value-param)
              tensor                               vir_force,
              const t_mdatoms                     *mdatoms,
              gmx_enerdata_t                      *enerd,
              t_fcdata                            *fcd,
              gmx::ArrayRef<real>                  lambda,
              t_graph                             *graph,
              t_forcerec                          *fr,
              gmx::PpForceWorkload                *ppForceWorkload,
              const gmx_vsite_t                   *vsite,
              rvec                                 mu_tot,
              double                               t,
              gmx_edsam                           *ed,
              int                                  flags,
              const DDBalanceRegionHandler        &ddBalanceRegionHandler)
{
    /* modify force flag if not doing nonbonded */
    if (!fr->bNonbonded)
    {
        flags &= ~GMX_FORCE_NONBONDED;
    }

    switch (inputrec->cutoff_scheme)
    {
        case ecutsVERLET:
            do_force_cutsVERLET(fplog, cr, ms, inputrec,
                                awh, enforcedRotation, step, nrnb, wcycle,
                                top, groups,
                                box, x, hist,
                                force, vir_force,
                                mdatoms, enerd, fcd,
                                lambda.data(), graph,
                                fr, ppForceWorkload, fr->ic,
                                vsite, mu_tot,
                                t, ed,
                                flags,
                                ddBalanceRegionHandler);
            break;
        case ecutsGROUP:
            do_force_cutsGROUP(fplog, cr, ms, inputrec,
                               awh, enforcedRotation, step, nrnb, wcycle,
                               top, groups,
                               box, x, hist,
                               force, vir_force,
                               mdatoms, enerd, fcd,
                               lambda.data(), graph,
                               fr, vsite, mu_tot,
                               t, ed,
                               flags,
                               ddBalanceRegionHandler);
            break;
        default:
            gmx_incons("Invalid cut-off scheme passed!");
    }
    /* In case we don't have constraints and are using GPUs, the next balancing
     * region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
     * virial calculation and COM pulling, is thus not included in
     * the balance timing, which is ok as most tasks do communication.
     */
    ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
}
void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
                        const t_inputrec *ir, const t_mdatoms *md,
                        t_state *state)
{
    int     i, m, start, end;
    int64_t step;
    real    dt = ir->delta_t;
    real    dvdl_dum;
    rvec   *savex;

    /* We need to allocate one element extra, since we might use
     * (unaligned) 4-wide SIMD loads to access rvec entries.
     */
    snew(savex, state->natoms + 1);

    start = 0;
    end   = md->homenr;

    if (debug)
    {
        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
                start, md->homenr, end);
    }
    /* Do a first constraint pass to reset the particles... */
    step = ir->init_step;
    if (fplog)
    {
        char buf[STEPSTRSIZE];
        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
                gmx_step_str(step, buf));
    }
    dvdl_dum = 0;

    /* constrain the current position */
    constr->apply(TRUE, FALSE,
                  step, 0, 1.0,
                  state->x.rvec_array(), state->x.rvec_array(), nullptr,
                  state->box,
                  state->lambda[efptBONDED], &dvdl_dum,
                  nullptr, nullptr, gmx::ConstraintVariable::Positions);
    if (EI_VV(ir->eI))
    {
        /* constrain the initial velocity, and save it */
        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
        constr->apply(TRUE, FALSE,
                      step, 0, 1.0,
                      state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
                      state->box,
                      state->lambda[efptBONDED], &dvdl_dum,
                      nullptr, nullptr, gmx::ConstraintVariable::Velocities);
    }
    /* constrain the initial velocities at t-dt/2 */
    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
    {
        auto x = makeArrayRef(state->x).subArray(start, end);
        auto v = makeArrayRef(state->v).subArray(start, end);
        for (i = start; (i < end); i++)
        {
            for (m = 0; (m < DIM); m++)
            {
                /* Reverse the velocity */
                v[i][m] = -v[i][m];
                /* Store the position at t-dt in buf */
                savex[i][m] = x[i][m] + dt*v[i][m];
            }
        }
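        /* In effect savex now holds x(0) - dt*v(0): with v negated, the loop
         * above is a plain Euler step backwards in time. The constraint call
         * below then projects these trial t = -dt positions onto the
         * constraint manifold, with the velocity corrections following.
         */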
        /* Shake the positions at t=-dt with the positions at t=0
         * as reference coordinates.
         */
        if (fplog)
        {
            char buf[STEPSTRSIZE];
            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
                    gmx_step_str(step, buf));
        }
        dvdl_dum = 0;
        constr->apply(TRUE, FALSE,
                      step, -1, 1.0,
                      state->x.rvec_array(), savex, nullptr,
                      state->box,
                      state->lambda[efptBONDED], &dvdl_dum,
                      state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);

        for (i = start; i < end; i++)
        {
            for (m = 0; m < DIM; m++)
            {
                /* Re-reverse the velocities */
                v[i][m] = -v[i][m];
            }
        }
    }
    sfree(savex);
}
static void
integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
                double *enerout, double *virout)
{
    double enersum, virsum;
    double invscale, invscale2, invscale3;
    double r, ea, eb, ec, pa, pb, pc, pd;
    double y0, f, g, h;
    int    ri, offset;
    double tabfactor;

    invscale  = 1.0/scale;
    invscale2 = invscale*invscale;
    invscale3 = invscale*invscale2;
    /* Following summation derived from cubic spline definition,
     * Numerical Recipes in C, second edition, p. 113-116. Exact for
     * the cubic spline. We first calculate the negative of the
     * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
     * add the more standard, abrupt cutoff correction to that result,
     * yielding the long-range correction for a switched function. We
     * perform both the pressure and energy loops at the same time for
     * simplicity, as the computational cost is low. */
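    /* Sketch of how the weights below arise: within table bin ri the spline
     * is p(t) = y0 + f*t + g*t^2 + h*t^3 with t in [0,1], and the radius is
     * r(t) = (ri + t)*invscale. Writing r = ri*invscale for the bin start,
     *   r(t)^2 dr = (ea*t^2 + eb*t + ec) dt
     *   r(t)^3    =  pa*t^3 + pb*t^2 + pc*t + pd
     * so integrating p(t) (for the energy) and p'(t) (for the virial) against
     * these polynomials over one bin gives the enersum and virsum terms.
     */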
    enersum = 0.0;
    virsum  = 0.0;
    if (offstart == 0)
    {
        /* Since the dispersion table has been scaled down a factor
         * 6.0 and the repulsion a factor 12.0 to compensate for the
         * c6/c12 parameters inside nbfp[] being scaled up (to save
         * flops in kernels), we need to correct for this.
         */
        tabfactor = 6.0;
    }
    else
    {
        tabfactor = 12.0;
    }

    for (ri = rstart; ri < rend; ++ri)
    {
        r  = ri*invscale;
        ea = invscale3;
        eb = 2.0*invscale2*r;
        ec = invscale*r*r;

        pa = invscale3;
        pb = 3.0*invscale2*r;
        pc = 3.0*invscale*r*r;
        pd = r*r*r;

        /* this "8" is from the packing in the vdwtab array - perhaps
           it should be a named constant? */
        offset = 8*ri + offstart;
        y0     = vdwtab[offset];
        f      = vdwtab[offset+1];
        g      = vdwtab[offset+2];
        h      = vdwtab[offset+3];

        enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
        virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
    }
    *enerout = 4.0*M_PI*enersum*tabfactor;
    *virout  = 4.0*M_PI*virsum*tabfactor;
}
static void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
{
    double  eners[2], virs[2], enersum, virsum;
    double  r0, rc3, rc9;
    int     ri0, ri1, i;
    real    scale, *vdwtab;

    fr->enershiftsix    = 0;
    fr->enershifttwelve = 0;
    fr->enerdiffsix     = 0;
    fr->enerdifftwelve  = 0;
    fr->virdiffsix      = 0;
    fr->virdifftwelve   = 0;

    const interaction_const_t *ic = fr->ic;

    if (eDispCorr != edispcNO)
    {
        for (i = 0; i < 2; i++)
        {
            eners[i] = 0;
            virs[i]  = 0;
        }
        if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
            (ic->vdw_modifier == eintmodPOTSWITCH) ||
            (ic->vdw_modifier == eintmodFORCESWITCH) ||
            (ic->vdwtype == evdwSHIFT) ||
            (ic->vdwtype == evdwSWITCH))
        {
            if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
                 (ic->vdw_modifier == eintmodFORCESWITCH) ||
                 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
            {
                gmx_fatal(FARGS,
                          "With dispersion correction rvdw-switch can not be zero "
                          "for vdw-type = %s", evdw_names[ic->vdwtype]);
            }

            /* TODO This code depends on the logic in tables.c that
               constructs the table layout, which should be made
               explicit in future cleanup. */
            GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
                       "Dispersion-correction code needs a table with both repulsion and dispersion terms");
            scale  = fr->dispersionCorrectionTable->scale;
            vdwtab = fr->dispersionCorrectionTable->data;

            /* Round the cut-offs to exact table values for precision */
            ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
            ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));

            /* The code below has some support for handling force-switching, i.e.
             * when the force (instead of potential) is switched over a limited
             * region. This leads to a constant shift in the potential inside the
             * switching region, which we can handle by adding a constant energy
             * term in the force-switch case just like when we do potential-shift.
             *
             * For now this is not enabled, but to keep the functionality in the
             * code we check separately for switch and shift. When we do force-switch
             * the shifting point is rvdw_switch, while it is the cutoff when we
             * have a classical potential-shift.
             *
             * For a pure potential-shift the potential has a constant shift
             * all the way out to the cutoff, and that is it. For other forms
             * we need to calculate the constant shift up to the point where we
             * start modifying the potential.
             */
            ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;

            r0  = ri0/scale;
            rc3 = r0*r0*r0;
            rc9 = rc3*rc3*rc3;

            if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
                (ic->vdwtype == evdwSHIFT))
            {
                /* Determine the constant energy shift below rvdw_switch.
                 * Table has a scale factor since we have scaled it down to compensate
                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
                 */
                fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
            }
            else if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
                fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
            }
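            /* In both cases above the shift constants are the values of the
             * plain -1/r^6 dispersion and +1/r^12 repulsion terms at the shift
             * point r0, per unit c6/c12: -1/r0^6 = -1/(rc3*rc3) and
             * 1/r0^12 = 1/(rc9*rc3). Subtracting these inside the cut-off
             * makes the modified potential vanish at r0; the force-switch/shift
             * case additionally subtracts the table value at r0, scaled by the
             * 6/12 factors described in integrate_table.
             */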
            /* Add the constant part from 0 to rvdw_switch.
             * This integration from 0 to rvdw_switch overcounts the number
             * of interactions by 1, as it also counts the self interaction.
             * We will correct for this later.
             */
            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
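            /* The rc3/3 factors come from integrating the constant shift over
             * the sphere r < r0 with g(r) = 1:
             *   int_0^r0 4*pi*r^2*shift dr = (4*pi/3)*shift*r0^3.
             */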
            /* Calculate the contribution in the range [r0,r1] where we
             * modify the potential. For a pure potential-shift modifier we will
             * have ri0==ri1, and there will not be any contribution here.
             */
            for (i = 0; i < 2; i++)
            {
                enersum = 0;
                virsum  = 0;
                integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
                eners[i] -= enersum;
                virs[i]  -= virsum;
            }

            /* Alright: Above we compensated by REMOVING the parts outside r0
             * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
             *
             * Regardless of whether r0 is the point where we start switching,
             * or the cutoff where we calculated the constant shift, we include
             * all the parts we are missing out to infinity from r0 by
             * calculating the analytical dispersion correction.
             */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
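            /* The four constants are the analytical tail corrections per unit
             * c6/c12, again with g(r) = 1:
             *   energy: int_r0^inf 4*pi*r^2*(-1/r^6) dr = -4*pi/(3*r0^3)
             *           int_r0^inf 4*pi*r^2*(1/r^12) dr =  4*pi/(9*r0^9)
             *   virial: the same integrals with r*dV/dr in place of V give
             *           8*pi/r0^3 and -16*pi/(3*r0^9).
             */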
        }
        else if (ic->vdwtype == evdwCUT ||
                 EVDW_PME(ic->vdwtype) ||
                 ic->vdwtype == evdwUSER)
        {
            if (ic->vdwtype == evdwUSER && fplog)
            {
                fprintf(fplog,
                        "WARNING: using dispersion correction with user tables\n");
            }

            /* Note that with LJ-PME, the dispersion correction is multiplied
             * by the difference between the actual C6 and the value of C6
             * that would produce the combination rule.
             * This means the normal energy and virial difference formulas
             * can be used here.
             */

            rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
            rc9 = rc3*rc3*rc3;
            /* Contribution beyond the cut-off */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                /* Contribution within the cut-off */
                eners[0] += -4.0*M_PI/(3.0*rc3);
                eners[1] +=  4.0*M_PI/(3.0*rc9);
            }
            /* Contribution beyond the cut-off */
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
        }
        else
        {
            gmx_fatal(FARGS,
                      "Dispersion correction is not implemented for vdw-type = %s",
                      evdw_names[ic->vdwtype]);
        }
        /* When we deprecate the group kernels the code below can go too */
        if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
        {
            /* Calculate self-interaction coefficient (assuming that
             * the reciprocal-space contribution is constant in the
             * region that contributes to the self-interaction).
             */
            fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;

            eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
        }

        fr->enerdiffsix    = eners[0];
        fr->enerdifftwelve = eners[1];
        /* The 0.5 is due to the Gromacs definition of the virial */
        fr->virdiffsix     = 0.5*virs[0];
        fr->virdifftwelve  = 0.5*virs[1];
    }
}
void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
                   const matrix box, real lambda, tensor pres, tensor virial,
                   real *prescorr, real *enercorr, real *dvdlcorr)
{
    gmx_bool bCorrAll, bCorrPres;
    real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
    int      m;

    *prescorr = 0;
    *enercorr = 0;
    *dvdlcorr = 0;

    clear_mat(virial);
    clear_mat(pres);

    if (ir->eDispCorr != edispcNO)
    {
        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
                     ir->eDispCorr == edispcAllEnerPres);
        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
                     ir->eDispCorr == edispcAllEnerPres);

        invvol = 1/det(box);
        if (fr->n_tpi)
        {
            /* Only correct for the interactions with the inserted molecule */
            dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
            ninter = fr->n_tpi;
        }
        else
        {
            dens   = fr->numAtomsForDispersionCorrection*invvol;
            ninter = 0.5*fr->numAtomsForDispersionCorrection;
        }
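        /* A homogeneous system of N = numAtomsForDispersionCorrection atoms
         * has ~N^2/2 interacting pairs; one factor of N is carried by
         * dens = N/V, so ninter supplies the remaining N/2 (or just the n_tpi
         * inserted-molecule interactions in the TPI branch above).
         */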
        if (ir->efep == efepNO)
        {
            avcsix    = fr->avcsix[0];
            avctwelve = fr->avctwelve[0];
        }
        else
        {
            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
        }

        enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
        *enercorr += avcsix*enerdiff;
        dvdlambda  = 0.0;
        if (ir->efep != efepNO)
        {
            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
        }
        if (bCorrAll)
        {
            enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
            *enercorr += avctwelve*enerdiff;
            if (fr->efep != efepNO)
            {
                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
            }
        }

        if (bCorrPres)
        {
            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
            if (ir->eDispCorr == edispcAllEnerPres)
            {
                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
            }
            /* The factor 2 is because of the Gromacs virial definition */
            spres = -2.0*invvol*svir*PRESFAC;

            for (m = 0; m < DIM; m++)
            {
                virial[m][m] += svir;
                pres[m][m]   += spres;
            }
            *prescorr += spres;
        }

        /* Can't currently control when it prints; for now, just print when debugging */
        if (debug)
        {
            if (bCorrAll)
            {
                fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
                        avcsix, avctwelve);
            }
            if (bCorrPres)
            {
                fprintf(debug,
                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
                        *enercorr, spres, svir);
            }
            else
            {
                fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
            }
        }

        if (fr->efep != efepNO)
        {
            *dvdlcorr += dvdlambda;
        }
    }
}
static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                            const gmx_mtop_t *mtop, rvec x[],
                            gmx_bool bFirst)
{
    t_graph *graph;
    int      as, mol;

    if (bFirst && fplog)
    {
        fprintf(fplog, "Removing pbc first time\n");
    }

    snew(graph, 1);
    as = 0;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t &moltype = mtop->moltype[molb.type];
        if (moltype.atoms.nr == 1 ||
            (!bFirst && moltype.cgs.nr == 1))
        {
            /* Just one atom or charge group in the molecule, no PBC required */
            as += molb.nmol*moltype.atoms.nr;
        }
        else
        {
            mk_graph_moltype(moltype, graph);

            for (mol = 0; mol < molb.nmol; mol++)
            {
                mk_mshift(fplog, graph, ePBC, box, x+as);

                shift_self(graph, box, x+as);
                /* The molecule is whole now.
                 * We don't need the second mk_mshift call as in do_pbc_first,
                 * since we no longer need this graph.
                 */

                as += moltype.atoms.nr;
            }
            done_graph(graph);
        }
    }
    sfree(graph);
}

void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
                       const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
}

void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                 const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
}
void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
{
    int t, nth;
    nth = gmx_omp_nthreads_get(emntDefault);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (t = 0; t < nth; t++)
    {
        try
        {
            size_t natoms = x.size();
            size_t offset = (natoms*t)/nth;
            size_t len    = (natoms*(t + 1))/nth - offset;
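            /* Static split of the atom range over nth threads: e.g. for
             * natoms = 10 and nth = 4 the (offset, len) pairs are
             * (0,2), (2,3), (5,2) and (7,3), covering each atom exactly once.
             */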
            put_atoms_in_box(ePBC, box, x.subArray(offset, len));
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
// TODO This can be cleaned up a lot, and moved back to runner.cpp
void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
                const t_inputrec *inputrec,
                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                gmx_walltime_accounting_t walltime_accounting,
                nonbonded_verlet_t *nbv,
                const gmx_pme_t *pme,
                gmx_bool bWriteStat)
{
    t_nrnb *nrnb_tot = nullptr;
    double  delta_t  = 0;
    double  nbfs     = 0, mflop = 0;
    double  elapsed_time,
            elapsed_time_over_all_ranks,
            elapsed_time_over_all_threads,
            elapsed_time_over_all_threads_over_all_ranks;
    /* Control whether it is valid to print a report. Only the
       simulation master may print, but it should not do so if the run
       terminated e.g. before a scheduled reset step. This is
       complicated by the fact that PME ranks are unaware of the
       reason why they were sent a pmerecvqxFINISH. To avoid
       communication deadlocks, we always do the communication for the
       report, even if we've decided not to write the report, because
       how long it takes to finish the run is not important when we've
       decided not to report on the simulation performance.

       Further, we only report performance for dynamical integrators,
       because those are the only ones for which we plan to
       consider doing any optimizations. */
    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);

    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
    {
        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
        printReport = false;
    }
    if (cr->nnodes > 1)
    {
        snew(nrnb_tot, 1);
#if GMX_MPI
        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
#endif
    }
    else
    {
        nrnb_tot = nrnb;
    }

    elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
    elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
    if (GMX_MPI && cr->nnodes > 1)
    {
#if GMX_MPI
        /* reduce elapsed_time over all MPI ranks in the current simulation */
        MPI_Allreduce(&elapsed_time,
                      &elapsed_time_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
        elapsed_time_over_all_ranks /= cr->nnodes;
        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
         * current simulation. */
        MPI_Allreduce(&elapsed_time_over_all_threads,
                      &elapsed_time_over_all_threads_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
#endif
    }
    else
    {
        elapsed_time_over_all_ranks                  = elapsed_time;
        elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
    }
    if (printReport)
    {
        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
    }

    if (cr->nnodes > 1)
    {
        sfree(nrnb_tot);
    }

    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
    {
        print_dd_statistics(cr, inputrec, fplog);
    }

    /* TODO Move the responsibility for any scaling by thread counts
     * to the code that handled the thread region, so that there's a
     * mechanism to keep cycle counting working during the transition
     * to task parallelism. */
    int  nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    int  nthreads_pme = gmx_omp_nthreads_get(emntPME);
    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
    auto cycle_sum(wallcycle_sum(cr, wcycle));

    if (printReport)
    {
        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
        gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
        if (pme_gpu_task_enabled(pme))
        {
            pme_gpu_get_timings(pme, &pme_gpu_timings);
        }
        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                        elapsed_time_over_all_ranks,
                        wcycle, cycle_sum,
                        nbnxn_gpu_timings,
                        &pme_gpu_timings);
        if (EI_DYNAMICS(inputrec->eI))
        {
            delta_t = inputrec->delta_t;
        }

        if (fplog)
        {
            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
        if (bWriteStat)
        {
            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
    }
}
void initialize_lambdas(FILE               *fplog,
                        const t_inputrec   &ir,
                        bool                isMaster,
                        int                *fep_state,
                        gmx::ArrayRef<real> lambda,
                        double             *lam0)
{
    /* TODO: Clean up initialization of fep_state and lambda in
       t_state. This function works, but could probably use a logic
       rewrite to keep all the different types of efep straight. */
    if ((ir.efep == efepNO) && (!ir.bSimTemp))
    {
        return;
    }

    const t_lambda *fep = ir.fepvals;
    if (isMaster)
    {
        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
                                             if checkpoint is set -- a kludge is in for now
                                             to prevent this */
    }

    for (int i = 0; i < efptNR; i++)
    {
        double thisLambda;
        /* overwrite lambda state with init_lambda for now for backwards compatibility */
        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
        {
            thisLambda = fep->init_lambda;
        }
        else
        {
            thisLambda = fep->all_lambda[i][fep->init_fep_state];
        }
        if (isMaster)
        {
            lambda[i] = thisLambda;
        }
        if (lam0 != nullptr)
        {
            lam0[i] = thisLambda;
        }
    }
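    /* Example: with init_lambda = 0.5 every lambda component (fep, mass, coul,
     * vdw, bonded, ...) becomes 0.5, whereas with init_lambda = -1 component i
     * takes its own value all_lambda[i][init_fep_state] from the mdp lambda
     * vectors.
     */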
    if (ir.bSimTemp)
    {
        /* need to rescale control temperatures to match current state */
        for (int i = 0; i < ir.opts.ngtc; i++)
        {
            if (ir.opts.ref_t[i] > 0)
            {
                ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
            }
        }
    }

    /* Send to the log the information on the current lambdas */
    if (fplog != nullptr)
    {
        fprintf(fplog, "Initial vector of lambda components:[ ");
        for (const auto &l : lambda)
        {
            fprintf(fplog, "%10.4f ", l);
        }
        fprintf(fplog, "]\n");
    }
}