2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
125 void print_time(FILE *out,
126 gmx_walltime_accounting_t walltime_accounting,
128 const t_inputrec *ir,
132 double dt, elapsed_seconds, time_per_step;
141 fputs(gmx::int64ToString(step).c_str(), out);
144 if ((step >= ir->nstlist))
146 double seconds_since_epoch = gmx_gettime();
147 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
149 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
155 finish = static_cast<time_t>(seconds_since_epoch + dt);
156 auto timebuf = gmx_ctime_r(&finish);
157 timebuf.erase(timebuf.find_first_of('\n'));
158 fputs(", will finish ", out);
159 fputs(timebuf.c_str(), out);
163 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
168 fprintf(out, " performance: %.1f ns/day ",
169 ir->delta_t/1000*24*60*60/time_per_step);
178 GMX_UNUSED_VALUE(cr);
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
192 time_t temp_time = static_cast<time_t>(the_time);
194 auto timebuf = gmx_ctime_r(&temp_time);
196 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
199 void print_start(FILE *fplog, const t_commrec *cr,
200 gmx_walltime_accounting_t walltime_accounting,
205 sprintf(buf, "Started %s", name);
206 print_date_and_time(fplog, cr->nodeid, buf,
207 walltime_accounting_get_start_time_stamp(walltime_accounting));
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
212 const int end = forceToAdd.size();
214 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
215 #pragma omp parallel for num_threads(nt) schedule(static)
216 for (int i = 0; i < end; i++)
218 rvec_inc(f[i], forceToAdd[i]);
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223 tensor vir_part, const t_graph *graph, const matrix box,
224 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
226 /* The short-range virial from surrounding boxes */
227 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
230 /* Calculate partial virial, for local atoms only, based on short range.
231 * Total virial is computed in global_stat, called from do_md
233 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
238 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
242 static void pull_potential_wrapper(const t_commrec *cr,
243 const t_inputrec *ir,
244 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245 gmx::ForceWithVirial *force,
246 const t_mdatoms *mdatoms,
247 gmx_enerdata_t *enerd,
250 gmx_wallcycle_t wcycle)
255 /* Calculate the center of mass forces, this requires communication,
256 * which is why pull_potential is called close to other communication.
258 wallcycle_start(wcycle, ewcPULLPOT);
259 set_pbc(&pbc, ir->ePBC, box);
261 enerd->term[F_COM_PULL] +=
262 pull_potential(ir->pull_work, mdatoms, &pbc,
263 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265 wallcycle_stop(wcycle, ewcPULLPOT);
268 static void pme_receive_force_ener(const t_commrec *cr,
269 gmx::ForceWithVirial *forceWithVirial,
270 gmx_enerdata_t *enerd,
271 gmx_wallcycle_t wcycle)
273 real e_q, e_lj, dvdl_q, dvdl_lj;
274 float cycles_ppdpme, cycles_seppme;
276 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
279 /* In case of node-splitting, the PP nodes receive the long-range
280 * forces, virial and energy from the PME nodes here.
282 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
285 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
287 enerd->term[F_COUL_RECIP] += e_q;
288 enerd->term[F_LJ_RECIP] += e_lj;
289 enerd->dvdl_lin[efptCOUL] += dvdl_q;
290 enerd->dvdl_lin[efptVDW] += dvdl_lj;
294 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
296 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
299 static void print_large_forces(FILE *fp,
307 real force2Tolerance = gmx::square(forceTolerance);
308 gmx::index numNonFinite = 0;
309 for (int i = 0; i < md->homenr; i++)
311 real force2 = norm2(f[i]);
312 bool nonFinite = !std::isfinite(force2);
313 if (force2 >= force2Tolerance || nonFinite)
315 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
317 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
324 if (numNonFinite > 0)
326 /* Note that with MPI this fatal call on one rank might interrupt
327 * the printing on other ranks. But we can only avoid that with
328 * an expensive MPI barrier that we would need at each step.
330 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
334 static void post_process_forces(const t_commrec *cr,
337 gmx_wallcycle_t wcycle,
338 const gmx_localtop_t *top,
342 gmx::ForceWithVirial *forceWithVirial,
344 const t_mdatoms *mdatoms,
345 const t_graph *graph,
346 const t_forcerec *fr,
347 const gmx_vsite_t *vsite,
350 if (fr->haveDirectVirialContributions)
352 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
356 /* Spread the mesh force on virtual sites to the other particles...
357 * This is parallellized. MPI communication is performed
358 * if the constructing atoms aren't local.
360 matrix virial = { { 0 } };
361 spread_vsite_f(vsite, x, fDirectVir, nullptr,
362 (flags & GMX_FORCE_VIRIAL) != 0, virial,
364 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365 forceWithVirial->addVirialContribution(virial);
368 if (flags & GMX_FORCE_VIRIAL)
370 /* Now add the forces, this is local */
371 sum_forces(f, forceWithVirial->force_);
373 /* Add the direct virial contributions */
374 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
379 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
384 if (fr->print_force >= 0)
386 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
390 static void do_nb_verlet(t_forcerec *fr,
391 const interaction_const_t *ic,
392 gmx_enerdata_t *enerd,
394 const Nbnxm::InteractionLocality ilocality,
398 gmx_wallcycle_t wcycle)
400 if (!(flags & GMX_FORCE_NONBONDED))
402 /* skip non-bonded calculation */
406 nonbonded_verlet_t *nbv = fr->nbv;
407 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
409 /* GPU kernel launch overhead is already timed separately */
410 if (fr->cutoff_scheme != ecutsVERLET)
412 gmx_incons("Invalid cut-off scheme passed!");
415 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
417 if (!bUsingGpuKernels)
419 /* When dynamic pair-list pruning is requested, we need to prune
420 * at nstlistPrune steps.
422 if (nbv->listParams->useDynamicPruning &&
423 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
425 /* Prune the pair-list beyond fr->ic->rlistPrune using
426 * the current coordinates of the atoms.
428 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
429 NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
430 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
433 wallcycle_sub_start(wcycle, ewcsNONBONDED);
436 NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
438 if (!bUsingGpuKernels)
440 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
444 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
448 const t_mdatoms *mdatoms,
451 gmx_enerdata_t *enerd,
454 gmx_wallcycle_t wcycle)
457 nb_kernel_data_t kernel_data;
459 real dvdl_nb[efptNR];
464 /* Add short-range interactions */
465 donb_flags |= GMX_NONBONDED_DO_SR;
467 /* Currently all group scheme kernels always calculate (shift-)forces */
468 if (flags & GMX_FORCE_FORCES)
470 donb_flags |= GMX_NONBONDED_DO_FORCE;
472 if (flags & GMX_FORCE_VIRIAL)
474 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
476 if (flags & GMX_FORCE_ENERGY)
478 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
481 kernel_data.flags = donb_flags;
482 kernel_data.lambda = lambda;
483 kernel_data.dvdl = dvdl_nb;
485 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
486 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
488 /* reset free energy components */
489 for (i = 0; i < efptNR; i++)
494 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
496 wallcycle_sub_start(wcycle, ewcsNONBONDED);
497 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
498 for (th = 0; th < nbl_lists->nnbl; th++)
502 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
503 x, f, fr, mdatoms, &kernel_data, nrnb);
505 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
508 if (fepvals->sc_alpha != 0)
510 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
511 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
515 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
516 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
519 /* If we do foreign lambda and we have soft-core interactions
520 * we have to recalculate the (non-linear) energies contributions.
522 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
524 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
525 kernel_data.lambda = lam_i;
526 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
527 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
528 /* Note that we add to kernel_data.dvdl, but ignore the result */
530 for (i = 0; i < enerd->n_lambda; i++)
532 for (j = 0; j < efptNR; j++)
534 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
536 reset_foreign_enerdata(enerd);
537 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
538 for (th = 0; th < nbl_lists->nnbl; th++)
542 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
543 x, f, fr, mdatoms, &kernel_data, nrnb);
545 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
548 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
549 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
553 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
556 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
558 return nbv != nullptr && nbv->bUseGPU;
561 static inline void clear_rvecs_omp(int n, rvec v[])
563 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
565 /* Note that we would like to avoid this conditional by putting it
566 * into the omp pragma instead, but then we still take the full
567 * omp parallel for overhead (at least with gcc5).
571 for (int i = 0; i < n; i++)
578 #pragma omp parallel for num_threads(nth) schedule(static)
579 for (int i = 0; i < n; i++)
586 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
588 * \param groupOptions Group options, containing T-coupling options
590 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
592 real nrdfCoupled = 0;
593 real nrdfUncoupled = 0;
594 real kineticEnergy = 0;
595 for (int g = 0; g < groupOptions.ngtc; g++)
597 if (groupOptions.tau_t[g] >= 0)
599 nrdfCoupled += groupOptions.nrdf[g];
600 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
604 nrdfUncoupled += groupOptions.nrdf[g];
608 /* This conditional with > also catches nrdf=0 */
609 if (nrdfCoupled > nrdfUncoupled)
611 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
619 /*! \brief This routine checks that the potential energy is finite.
621 * Always checks that the potential energy is finite. If step equals
622 * inputrec.init_step also checks that the magnitude of the potential energy
623 * is reasonable. Terminates with a fatal error when a check fails.
624 * Note that passing this check does not guarantee finite forces,
625 * since those use slightly different arithmetics. But in most cases
626 * there is just a narrow coordinate range where forces are not finite
627 * and energies are finite.
629 * \param[in] step The step number, used for checking and printing
630 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
631 * \param[in] inputrec The input record
633 static void checkPotentialEnergyValidity(int64_t step,
634 const gmx_enerdata_t &enerd,
635 const t_inputrec &inputrec)
637 /* Threshold valid for comparing absolute potential energy against
638 * the kinetic energy. Normally one should not consider absolute
639 * potential energy values, but with a factor of one million
640 * we should never get false positives.
642 constexpr real c_thresholdFactor = 1e6;
644 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
645 real averageKineticEnergy = 0;
646 /* We only check for large potential energy at the initial step,
647 * because that is by far the most likely step for this too occur
648 * and because computing the average kinetic energy is not free.
649 * Note: nstcalcenergy >> 1 often does not allow to catch large energies
650 * before they become NaN.
652 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
654 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
657 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
658 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
660 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
663 energyIsNotFinite ? "not finite" : "extremely high",
665 enerd.term[F_COUL_SR],
666 energyIsNotFinite ? "non-finite" : "very high",
667 energyIsNotFinite ? " or Nan" : "");
671 /*! \brief Compute forces and/or energies for special algorithms
673 * The intention is to collect all calls to algorithms that compute
674 * forces on local atoms only and that do not contribute to the local
675 * virial sum (but add their virial contribution separately).
676 * Eventually these should likely all become ForceProviders.
677 * Within this function the intention is to have algorithms that do
678 * global communication at the end, so global barriers within the MD loop
679 * are as close together as possible.
681 * \param[in] fplog The log file
682 * \param[in] cr The communication record
683 * \param[in] inputrec The input record
684 * \param[in] awh The Awh module (nullptr if none in use).
685 * \param[in] enforcedRotation Enforced rotation module.
686 * \param[in] step The current MD step
687 * \param[in] t The current time
688 * \param[in,out] wcycle Wallcycle accounting struct
689 * \param[in,out] forceProviders Pointer to a list of force providers
690 * \param[in] box The unit cell
691 * \param[in] x The coordinates
692 * \param[in] mdatoms Per atom properties
693 * \param[in] lambda Array of free-energy lambda values
694 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
695 * \param[in,out] forceWithVirial Force and virial buffers
696 * \param[in,out] enerd Energy buffer
697 * \param[in,out] ed Essential dynamics pointer
698 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
700 * \todo Remove bNS, which is used incorrectly.
701 * \todo Convert all other algorithms called here to ForceProviders.
704 computeSpecialForces(FILE *fplog,
706 const t_inputrec *inputrec,
708 gmx_enfrot *enforcedRotation,
711 gmx_wallcycle_t wcycle,
712 ForceProviders *forceProviders,
714 gmx::ArrayRef<const gmx::RVec> x,
715 const t_mdatoms *mdatoms,
718 gmx::ForceWithVirial *forceWithVirial,
719 gmx_enerdata_t *enerd,
723 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
725 /* NOTE: Currently all ForceProviders only provide forces.
726 * When they also provide energies, remove this conditional.
730 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
731 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
733 /* Collect forces from modules */
734 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
737 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
739 pull_potential_wrapper(cr, inputrec, box, x,
741 mdatoms, enerd, lambda, t,
746 enerd->term[F_COM_PULL] +=
747 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
749 t, step, wcycle, fplog);
753 rvec *f = as_rvec_array(forceWithVirial->force_.data());
755 /* Add the forces from enforced rotation potentials (if any) */
758 wallcycle_start(wcycle, ewcROTadd);
759 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
760 wallcycle_stop(wcycle, ewcROTadd);
765 /* Note that since init_edsam() is called after the initialization
766 * of forcerec, edsam doesn't request the noVirSum force buffer.
767 * Thus if no other algorithm (e.g. PME) requires it, the forces
768 * here will contribute to the virial.
770 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
773 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
774 if (inputrec->bIMD && computeForces)
776 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
780 /*! \brief Launch the prepare_step and spread stages of PME GPU.
782 * \param[in] pmedata The PME structure
783 * \param[in] box The box matrix
784 * \param[in] x Coordinate array
785 * \param[in] flags Force flags
786 * \param[in] pmeFlags PME flags
787 * \param[in] wcycle The wallcycle structure
789 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
794 gmx_wallcycle_t wcycle)
796 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
797 pme_gpu_launch_spread(pmedata, x, wcycle);
800 /*! \brief Launch the FFT and gather stages of PME GPU
802 * This function only implements setting the output forces (no accumulation).
804 * \param[in] pmedata The PME structure
805 * \param[in] wcycle The wallcycle structure
807 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
808 gmx_wallcycle_t wcycle)
810 pme_gpu_launch_complex_transforms(pmedata, wcycle);
811 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
815 * Polling wait for either of the PME or nonbonded GPU tasks.
817 * Instead of a static order in waiting for GPU tasks, this function
818 * polls checking which of the two tasks completes first, and does the
819 * associated force buffer reduction overlapped with the other task.
820 * By doing that, unlike static scheduling order, it can always overlap
821 * one of the reductions, regardless of the GPU task completion order.
823 * \param[in] nbv Nonbonded verlet structure
824 * \param[in,out] pmedata PME module data
825 * \param[in,out] force Force array to reduce task outputs into.
826 * \param[in,out] forceWithVirial Force and virial buffers
827 * \param[in,out] fshift Shift force output vector results are reduced into
828 * \param[in,out] enerd Energy data structure results are reduced into
829 * \param[in] flags Force flags
830 * \param[in] pmeFlags PME flags
831 * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
832 * \param[in] wcycle The wallcycle structure
834 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
836 gmx::ArrayRefWithPadding<gmx::RVec> *force,
837 gmx::ForceWithVirial *forceWithVirial,
839 gmx_enerdata_t *enerd,
843 gmx_wallcycle_t wcycle)
845 bool isPmeGpuDone = false;
846 bool isNbGpuDone = false;
849 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
851 while (!isPmeGpuDone || !isNbGpuDone)
855 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
856 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
861 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
862 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
863 isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
865 Nbnxm::AtomLocality::Local,
867 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
868 fshift, completionType);
869 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
870 // To get the call count right, when the task finished we
871 // issue a start/stop.
872 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
873 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
876 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
877 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
879 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
880 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
887 * Launch the dynamic rolling pruning GPU task.
889 * We currently alternate local/non-local list pruning in odd-even steps
890 * (only pruning every second step without DD).
892 * \param[in] cr The communication record
893 * \param[in] nbv Nonbonded verlet structure
894 * \param[in] inputrec The input record
895 * \param[in] step The current MD step
897 static inline void launchGpuRollingPruning(const t_commrec *cr,
898 const nonbonded_verlet_t *nbv,
899 const t_inputrec *inputrec,
902 /* We should not launch the rolling pruning kernel at a search
903 * step or just before search steps, since that's useless.
904 * Without domain decomposition we prune at even steps.
905 * With domain decomposition we alternate local and non-local
906 * pruning at even and odd steps.
908 int numRollingParts = nbv->listParams->numRollingParts;
909 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
910 int stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
911 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
912 if (stepWithCurrentList > 0 &&
913 stepWithCurrentList < inputrec->nstlist - 1 &&
914 (stepIsEven || DOMAINDECOMP(cr)))
916 Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
917 stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
922 static void do_force_cutsVERLET(FILE *fplog,
924 const gmx_multisim_t *ms,
925 const t_inputrec *inputrec,
927 gmx_enfrot *enforcedRotation,
930 gmx_wallcycle_t wcycle,
931 const gmx_localtop_t *top,
932 const gmx_groups_t * /* groups */,
933 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
935 gmx::ArrayRefWithPadding<gmx::RVec> force,
937 const t_mdatoms *mdatoms,
938 gmx_enerdata_t *enerd, t_fcdata *fcd,
942 gmx::PpForceWorkload *ppForceWorkload,
943 interaction_const_t *ic,
944 const gmx_vsite_t *vsite,
949 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
950 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
954 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
955 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
956 rvec vzero, box_diag;
957 float cycles_pme, cycles_wait_gpu;
958 nonbonded_verlet_t *nbv = fr->nbv;
960 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
961 bNS = ((flags & GMX_FORCE_NS) != 0);
962 bFillGrid = (bNS && bStateChanged);
963 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
964 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
965 bUseGPU = fr->nbv->bUseGPU;
966 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
968 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
969 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
970 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
971 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
972 const int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
973 ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
974 ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
975 ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
977 /* At a search step we need to start the first balancing region
978 * somewhere early inside the step after communication during domain
979 * decomposition (and not during the previous step as usual).
982 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
984 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
990 const int homenr = mdatoms->homenr;
992 clear_mat(vir_force);
994 if (DOMAINDECOMP(cr))
996 cg1 = cr->dd->globalAtomGroupIndices.size();
1009 update_forcerec(fr, box);
1011 if (inputrecNeedMutot(inputrec))
1013 /* Calculate total (local) dipole moment in a temporary common array.
1014 * This makes it possible to sum them over nodes faster.
1016 calc_mu(start, homenr,
1017 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1022 if (fr->ePBC != epbcNONE)
1024 /* Compute shift vectors every step,
1025 * because of pressure coupling or box deformation!
1027 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1029 calc_shifts(box, fr->shift_vec);
1034 put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1035 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1037 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1039 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1043 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1044 fr->shift_vec, nbv->nbat);
1047 if (!thisRankHasDuty(cr, DUTY_PME))
1049 /* Send particle coordinates to the pme nodes.
1050 * Since this is only implemented for domain decomposition
1051 * and domain decomposition does not use the graph,
1052 * we do not need to worry about shifting.
1054 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1055 lambda[efptCOUL], lambda[efptVDW],
1056 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1059 #endif /* GMX_MPI */
1063 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1066 /* do gridding for pair search */
1069 if (graph && bStateChanged)
1071 /* Calculate intramolecular shift vectors to make molecules whole */
1072 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1076 box_diag[XX] = box[XX][XX];
1077 box_diag[YY] = box[YY][YY];
1078 box_diag[ZZ] = box[ZZ][ZZ];
1080 wallcycle_start(wcycle, ewcNS);
1081 if (!DOMAINDECOMP(cr))
1083 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1084 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1086 nullptr, 0, mdatoms->homenr, -1,
1087 fr->cginfo, x.unpaddedArrayRef(),
1089 nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
1091 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1095 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1096 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1097 fr->cginfo, x.unpaddedArrayRef(),
1098 nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
1100 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1103 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1105 wallcycle_stop(wcycle, ewcNS);
1108 /* initialize the GPU atom data and copy shift vector */
1111 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1112 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1116 Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1119 Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1121 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1123 if (bNS && fr->gpuBonded)
1125 /* Now we put all atoms on the grid, we can assign bonded
1126 * interactions to the GPU, where the grid order is
1127 * needed. Also the xq, f and fshift device buffers have
1128 * been reallocated if needed, so the bonded code can
1129 * learn about them. */
1130 // TODO the xq, f, and fshift buffers are now shared
1131 // resources, so they should be maintained by a
1132 // higher-level object than the nb module.
1133 fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1135 Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1136 Nbnxm::gpu_get_f(nbv->gpu_nbv),
1137 Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1138 ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1141 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1144 /* do local pair search */
1147 nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
1149 wallcycle_start_nocount(wcycle, ewcNS);
1150 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1151 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1153 nbv->listParams->rlistOuter,
1154 nbv->min_ci_balanced,
1156 Nbnxm::InteractionLocality::Local,
1157 nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
1159 pairlistSet.outerListCreationStep = step;
1160 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1162 nbnxnPrepareListForDynamicPruning(&pairlistSet);
1164 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1168 /* initialize local pair-list on the GPU */
1169 Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
1170 pairlistSet.nblGpu[0],
1171 Nbnxm::InteractionLocality::Local);
1173 wallcycle_stop(wcycle, ewcNS);
1177 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1178 FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1184 if (DOMAINDECOMP(cr))
1186 ddOpenBalanceRegionGpu(cr->dd);
1189 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1191 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1192 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1193 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1195 // bonded work not split into separate local and non-local, so with DD
1196 // we can only launch the kernel after non-local coordinates have been received.
1197 if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1199 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1200 fr->gpuBonded->launchKernels(fr, flags, box);
1201 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1204 /* launch local nonbonded work on GPU */
1205 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1206 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1207 step, nrnb, wcycle);
1208 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1209 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1214 // In PME GPU and mixed mode we launch FFT / gather after the
1215 // X copy/transform to allow overlap as well as after the GPU NB
1216 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1217 // the nonbonded kernel.
1218 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1221 /* Communicate coordinates and sum dipole if necessary +
1222 do non-local pair search */
1223 if (DOMAINDECOMP(cr))
1225 nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
1229 wallcycle_start_nocount(wcycle, ewcNS);
1230 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1232 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1234 nbv->listParams->rlistOuter,
1235 nbv->min_ci_balanced,
1237 Nbnxm::InteractionLocality::NonLocal,
1238 nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
1240 pairlistSet.outerListCreationStep = step;
1241 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1243 nbnxnPrepareListForDynamicPruning(&pairlistSet);
1245 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1247 if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
1249 /* initialize non-local pair-list on the GPU */
1250 Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
1251 pairlistSet.nblGpu[0],
1252 Nbnxm::InteractionLocality::NonLocal);
1254 wallcycle_stop(wcycle, ewcNS);
1258 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1260 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1261 FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1267 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1269 /* launch non-local nonbonded tasks on GPU */
1270 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1271 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1272 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1274 if (ppForceWorkload->haveGpuBondedWork)
1276 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1277 fr->gpuBonded->launchKernels(fr, flags, box);
1278 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1281 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1282 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1283 step, nrnb, wcycle);
1284 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1286 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1292 /* launch D2H copy-back F */
1293 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1294 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1295 if (DOMAINDECOMP(cr))
1297 Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1298 flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1300 Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1301 flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1302 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1304 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1306 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1307 fr->gpuBonded->launchEnergyTransfer();
1308 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1310 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1313 if (bStateChanged && inputrecNeedMutot(inputrec))
1317 gmx_sumd(2*DIM, mu, cr);
1318 ddReopenBalanceRegionCpu(cr->dd);
1321 for (i = 0; i < 2; i++)
1323 for (j = 0; j < DIM; j++)
1325 fr->mu_tot[i][j] = mu[i*DIM + j];
1329 if (fr->efep == efepNO)
1331 copy_rvec(fr->mu_tot[0], mu_tot);
1335 for (j = 0; j < DIM; j++)
1338 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1339 lambda[efptCOUL]*fr->mu_tot[1][j];
1343 /* Reset energies */
1344 reset_enerdata(enerd);
1345 clear_rvecs(SHIFTS, fr->fshift);
1347 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1349 wallcycle_start(wcycle, ewcPPDURINGPME);
1350 dd_force_flop_start(cr->dd, nrnb);
1355 wallcycle_start(wcycle, ewcROT);
1356 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1357 wallcycle_stop(wcycle, ewcROT);
1360 /* Temporary solution until all routines take PaddedRVecVector */
1361 rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1363 /* Start the force cycle counter.
1364 * Note that a different counter is used for dynamic load balancing.
1366 wallcycle_start(wcycle, ewcFORCE);
1368 gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1371 /* If we need to compute the virial, we might need a separate
1372 * force buffer for algorithms for which the virial is calculated
1373 * directly, such as PME.
1375 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1377 forceRef = *fr->forceBufferForDirectVirialContributions;
1379 /* TODO: update comment
1380 * We only compute forces on local atoms. Note that vsites can
1381 * spread to non-local atoms, but that part of the buffer is
1382 * cleared separately in the vsite spreading code.
1384 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1386 /* Clear the short- and long-range forces */
1387 clear_rvecs_omp(fr->natoms_force_constr, f);
1390 /* forceWithVirial uses the local atom range only */
1391 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1393 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1395 clear_pull_forces(inputrec->pull_work);
1398 /* We calculate the non-bonded forces, when done on the CPU, here.
1399 * We do this before calling do_force_lowlevel, because in that
1400 * function, the listed forces are calculated before PME, which
1401 * does communication. With this order, non-bonded and listed
1402 * force calculation imbalance can be balanced out by the domain
1403 * decomposition load balancing.
1408 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1409 step, nrnb, wcycle);
1412 if (fr->efep != efepNO)
1414 /* Calculate the local and non-local free energy interactions here.
1415 * Happens here on the CPU both with and without GPU.
1417 if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
1419 do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
1420 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1421 inputrec->fepvals, lambda,
1422 enerd, flags, nrnb, wcycle);
1425 if (DOMAINDECOMP(cr) &&
1426 fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1428 do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
1429 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1430 inputrec->fepvals, lambda,
1431 enerd, flags, nrnb, wcycle);
1437 if (DOMAINDECOMP(cr))
1439 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1440 step, nrnb, wcycle);
1443 const Nbnxm::InteractionLocality iloc =
1444 (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
1446 /* Add all the non-bonded force to the normal force array.
1447 * This can be split into a local and a non-local part when overlapping
1448 * communication with calculation with domain decomposition.
1450 wallcycle_stop(wcycle, ewcFORCE);
1452 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
1454 wallcycle_start_nocount(wcycle, ewcFORCE);
1456 /* if there are multiple fshift output buffers reduce them */
1457 if ((flags & GMX_FORCE_VIRIAL) &&
1458 nbv->grp[iloc].nbl_lists.nnbl > 1)
1460 /* This is not in a subcounter because it takes a
1461 negligible and constant-sized amount of time */
1462 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1467 /* update QMMMrec, if necessary */
1470 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1473 /* Compute the bonded and non-bonded energies and optionally forces */
1474 do_force_lowlevel(fr, inputrec, &(top->idef),
1475 cr, ms, nrnb, wcycle, mdatoms,
1476 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1477 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1478 flags, &cycles_pme);
1480 wallcycle_stop(wcycle, ewcFORCE);
1482 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1484 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1485 flags, &forceWithVirial, enerd,
1490 /* wait for non-local forces (or calculate in emulation mode) */
1491 if (DOMAINDECOMP(cr))
1495 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1496 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1497 flags, Nbnxm::AtomLocality::NonLocal,
1498 ppForceWorkload->haveGpuBondedWork,
1499 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1501 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1505 wallcycle_start_nocount(wcycle, ewcFORCE);
1506 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1507 step, nrnb, wcycle);
1508 wallcycle_stop(wcycle, ewcFORCE);
1511 /* skip the reduction if there was no non-local work to do */
1512 if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
1514 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1515 nbv->nbat, f, wcycle);
1520 if (DOMAINDECOMP(cr))
1522 /* We are done with the CPU compute.
1523 * We will now communicate the non-local forces.
1524 * If we use a GPU this will overlap with GPU work, so in that case
1525 * we do not close the DD force balancing region here.
1527 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1529 ddCloseBalanceRegionCpu(cr->dd);
1533 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1537 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1538 // an alternating wait/reduction scheme.
1539 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1540 if (alternateGpuWait)
1542 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1545 if (!alternateGpuWait && useGpuPme)
1547 pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1550 /* Wait for local GPU NB outputs on the non-alternating wait path */
1551 if (!alternateGpuWait && bUseGPU)
1553 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1554 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1555 * but even with a step of 0.1 ms the difference is less than 1%
1558 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1560 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1561 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1562 flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1563 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1565 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1567 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1569 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1570 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1572 /* We measured few cycles, it could be that the kernel
1573 * and transfer finished earlier and there was no actual
1574 * wait time, only API call overhead.
1575 * Then the actual time could be anywhere between 0 and
1576 * cycles_wait_est. We will use half of cycles_wait_est.
1578 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1580 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1584 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1586 // NOTE: emulation kernel is not included in the balancing region,
1587 // but emulation mode does not target performance anyway
1588 wallcycle_start_nocount(wcycle, ewcFORCE);
1589 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1590 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1591 step, nrnb, wcycle);
1592 wallcycle_stop(wcycle, ewcFORCE);
1597 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1602 /* now clear the GPU outputs while we finish the step on the CPU */
1603 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1604 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1605 Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1607 /* Is dynamic pair-list pruning activated? */
1608 if (nbv->listParams->useDynamicPruning)
1610 launchGpuRollingPruning(cr, nbv, inputrec, step);
1612 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1613 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1616 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1618 wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1619 // in principle this should be included in the DD balancing region,
1620 // but generally it is infrequent so we'll omit it for the sake of
1622 fr->gpuBonded->accumulateEnergyTerms(enerd);
1623 wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1625 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1626 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1627 fr->gpuBonded->clearEnergies();
1628 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1629 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1632 /* Do the nonbonded GPU (or emulation) force buffer reduction
1633 * on the non-alternating path. */
1634 if (bUseOrEmulGPU && !alternateGpuWait)
1636 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1637 nbv->nbat, f, wcycle);
1639 if (DOMAINDECOMP(cr))
1641 dd_force_flop_stop(cr->dd, nrnb);
1646 /* If we have NoVirSum forces, but we do not calculate the virial,
1647 * we sum fr->f_novirsum=f later.
1649 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1651 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1652 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1655 if (flags & GMX_FORCE_VIRIAL)
1657 /* Calculation of the virial must be done after vsites! */
1658 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1659 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1663 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1665 /* In case of node-splitting, the PP nodes receive the long-range
1666 * forces, virial and energy from the PME nodes here.
1668 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1673 post_process_forces(cr, step, nrnb, wcycle,
1674 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1675 vir_force, mdatoms, graph, fr, vsite,
1679 if (flags & GMX_FORCE_ENERGY)
1681 /* Sum the potential energy terms from group contributions */
1682 sum_epot(&(enerd->grpp), enerd->term);
1684 if (!EI_TPI(inputrec->eI))
1686 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1691 static void do_force_cutsGROUP(FILE *fplog,
1692 const t_commrec *cr,
1693 const gmx_multisim_t *ms,
1694 const t_inputrec *inputrec,
1696 gmx_enfrot *enforcedRotation,
1699 gmx_wallcycle_t wcycle,
1700 gmx_localtop_t *top,
1701 const gmx_groups_t *groups,
1702 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1704 gmx::ArrayRefWithPadding<gmx::RVec> force,
1706 const t_mdatoms *mdatoms,
1707 gmx_enerdata_t *enerd,
1712 const gmx_vsite_t *vsite,
1717 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1718 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1722 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1726 const int start = 0;
1727 const int homenr = mdatoms->homenr;
1729 clear_mat(vir_force);
1732 if (DOMAINDECOMP(cr))
1734 cg1 = cr->dd->globalAtomGroupIndices.size();
1745 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1746 bNS = ((flags & GMX_FORCE_NS) != 0);
1747 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1748 bFillGrid = (bNS && bStateChanged);
1749 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1750 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1754 update_forcerec(fr, box);
1756 if (inputrecNeedMutot(inputrec))
1758 /* Calculate total (local) dipole moment in a temporary common array.
1759 * This makes it possible to sum them over nodes faster.
1761 calc_mu(start, homenr,
1762 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1767 if (fr->ePBC != epbcNONE)
1769 /* Compute shift vectors every step,
1770 * because of pressure coupling or box deformation!
1772 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1774 calc_shifts(box, fr->shift_vec);
1779 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1780 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1781 inc_nrnb(nrnb, eNR_CGCM, homenr);
1782 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1784 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1786 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1791 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1792 inc_nrnb(nrnb, eNR_CGCM, homenr);
1795 if (bCalcCGCM && gmx_debug_at)
1797 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1801 if (!thisRankHasDuty(cr, DUTY_PME))
1803 /* Send particle coordinates to the pme nodes.
1804 * Since this is only implemented for domain decomposition
1805 * and domain decomposition does not use the graph,
1806 * we do not need to worry about shifting.
1808 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1809 lambda[efptCOUL], lambda[efptVDW],
1810 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1813 #endif /* GMX_MPI */
1815 /* Communicate coordinates and sum dipole if necessary */
1816 if (DOMAINDECOMP(cr))
1818 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1820 /* No GPU support, no move_x overlap, so reopen the balance region here */
1821 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1823 ddReopenBalanceRegionCpu(cr->dd);
1827 if (inputrecNeedMutot(inputrec))
1833 gmx_sumd(2*DIM, mu, cr);
1834 ddReopenBalanceRegionCpu(cr->dd);
1836 for (i = 0; i < 2; i++)
1838 for (j = 0; j < DIM; j++)
1840 fr->mu_tot[i][j] = mu[i*DIM + j];
1844 if (fr->efep == efepNO)
1846 copy_rvec(fr->mu_tot[0], mu_tot);
1850 for (j = 0; j < DIM; j++)
1853 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1858 /* Reset energies */
1859 reset_enerdata(enerd);
1860 clear_rvecs(SHIFTS, fr->fshift);
1864 wallcycle_start(wcycle, ewcNS);
1866 if (graph && bStateChanged)
1868 /* Calculate intramolecular shift vectors to make molecules whole */
1869 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1872 /* Do the actual neighbour searching */
1874 groups, top, mdatoms,
1875 cr, nrnb, bFillGrid);
1877 wallcycle_stop(wcycle, ewcNS);
1880 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1882 wallcycle_start(wcycle, ewcPPDURINGPME);
1883 dd_force_flop_start(cr->dd, nrnb);
1888 wallcycle_start(wcycle, ewcROT);
1889 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1890 wallcycle_stop(wcycle, ewcROT);
1893 /* Temporary solution until all routines take PaddedRVecVector */
1894 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1896 /* Start the force cycle counter.
1897 * Note that a different counter is used for dynamic load balancing.
1899 wallcycle_start(wcycle, ewcFORCE);
1901 gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1904 /* If we need to compute the virial, we might need a separate
1905 * force buffer for algorithms for which the virial is calculated
1906 * separately, such as PME.
1908 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1910 forceRef = *fr->forceBufferForDirectVirialContributions;
1912 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1915 /* Clear the short- and long-range forces */
1916 clear_rvecs(fr->natoms_force_constr, f);
1919 /* forceWithVirial might need the full force atom range */
1920 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1922 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1924 clear_pull_forces(inputrec->pull_work);
1927 /* update QMMMrec, if necessary */
1930 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1933 /* Compute the bonded and non-bonded energies and optionally forces */
1934 do_force_lowlevel(fr, inputrec, &(top->idef),
1935 cr, ms, nrnb, wcycle, mdatoms,
1936 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1937 box, inputrec->fepvals, lambda,
1938 graph, &(top->excls), fr->mu_tot,
1942 wallcycle_stop(wcycle, ewcFORCE);
1944 if (DOMAINDECOMP(cr))
1946 dd_force_flop_stop(cr->dd, nrnb);
1948 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1950 ddCloseBalanceRegionCpu(cr->dd);
1954 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1956 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1957 flags, &forceWithVirial, enerd,
1962 /* Communicate the forces */
1963 if (DOMAINDECOMP(cr))
1965 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1966 /* Do we need to communicate the separate force array
1967 * for terms that do not contribute to the single sum virial?
1968 * Position restraints and electric fields do not introduce
1969 * inter-cg forces, only full electrostatics methods do.
1970 * When we do not calculate the virial, fr->f_novirsum = f,
1971 * so we have already communicated these forces.
1973 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1974 (flags & GMX_FORCE_VIRIAL))
1976 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1980 /* If we have NoVirSum forces, but we do not calculate the virial,
1981 * we sum fr->f_novirsum=f later.
1983 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1985 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1986 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1989 if (flags & GMX_FORCE_VIRIAL)
1991 /* Calculation of the virial must be done after vsites! */
1992 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1993 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1997 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1999 /* In case of node-splitting, the PP nodes receive the long-range
2000 * forces, virial and energy from the PME nodes here.
2002 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2007 post_process_forces(cr, step, nrnb, wcycle,
2008 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2009 vir_force, mdatoms, graph, fr, vsite,
2013 if (flags & GMX_FORCE_ENERGY)
2015 /* Sum the potential energy terms from group contributions */
2016 sum_epot(&(enerd->grpp), enerd->term);
2018 if (!EI_TPI(inputrec->eI))
2020 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2026 void do_force(FILE *fplog,
2027 const t_commrec *cr,
2028 const gmx_multisim_t *ms,
2029 const t_inputrec *inputrec,
2031 gmx_enfrot *enforcedRotation,
2034 gmx_wallcycle_t wcycle,
2035 gmx_localtop_t *top,
2036 const gmx_groups_t *groups,
2038 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
2040 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
2042 const t_mdatoms *mdatoms,
2043 gmx_enerdata_t *enerd,
2045 gmx::ArrayRef<real> lambda,
2048 gmx::PpForceWorkload *ppForceWorkload,
2049 const gmx_vsite_t *vsite,
2054 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2055 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2057 /* modify force flag if not doing nonbonded */
2058 if (!fr->bNonbonded)
2060 flags &= ~GMX_FORCE_NONBONDED;
2063 switch (inputrec->cutoff_scheme)
2066 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2067 awh, enforcedRotation, step, nrnb, wcycle,
2074 lambda.data(), graph,
2081 ddOpenBalanceRegion,
2082 ddCloseBalanceRegion);
2085 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2086 awh, enforcedRotation, step, nrnb, wcycle,
2093 lambda.data(), graph,
2097 ddOpenBalanceRegion,
2098 ddCloseBalanceRegion);
2101 gmx_incons("Invalid cut-off scheme passed!");
2104 /* In case we don't have constraints and are using GPUs, the next balancing
2105 * region starts here.
2106 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2107 * virial calculation and COM pulling, is not thus not included in
2108 * the balance timing, which is ok as most tasks do communication.
2110 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2112 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2117 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2118 const t_inputrec *ir, const t_mdatoms *md,
2121 int i, m, start, end;
2123 real dt = ir->delta_t;
2127 /* We need to allocate one element extra, since we might use
2128 * (unaligned) 4-wide SIMD loads to access rvec entries.
2130 snew(savex, state->natoms + 1);
2137 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2138 start, md->homenr, end);
2140 /* Do a first constrain to reset particles... */
2141 step = ir->init_step;
2144 char buf[STEPSTRSIZE];
2145 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2146 gmx_step_str(step, buf));
2150 /* constrain the current position */
2151 constr->apply(TRUE, FALSE,
2153 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2155 state->lambda[efptBONDED], &dvdl_dum,
2156 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2159 /* constrain the inital velocity, and save it */
2160 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2161 constr->apply(TRUE, FALSE,
2163 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2165 state->lambda[efptBONDED], &dvdl_dum,
2166 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2168 /* constrain the inital velocities at t-dt/2 */
2169 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2171 auto x = makeArrayRef(state->x).subArray(start, end);
2172 auto v = makeArrayRef(state->v).subArray(start, end);
2173 for (i = start; (i < end); i++)
2175 for (m = 0; (m < DIM); m++)
2177 /* Reverse the velocity */
2179 /* Store the position at t-dt in buf */
2180 savex[i][m] = x[i][m] + dt*v[i][m];
2183 /* Shake the positions at t=-dt with the positions at t=0
2184 * as reference coordinates.
2188 char buf[STEPSTRSIZE];
2189 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2190 gmx_step_str(step, buf));
2193 constr->apply(TRUE, FALSE,
2195 state->x.rvec_array(), savex, nullptr,
2197 state->lambda[efptBONDED], &dvdl_dum,
2198 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2200 for (i = start; i < end; i++)
2202 for (m = 0; m < DIM; m++)
2204 /* Re-reverse the velocities */
2214 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2215 double *enerout, double *virout)
2217 double enersum, virsum;
2218 double invscale, invscale2, invscale3;
2219 double r, ea, eb, ec, pa, pb, pc, pd;
2224 invscale = 1.0/scale;
2225 invscale2 = invscale*invscale;
2226 invscale3 = invscale*invscale2;
2228 /* Following summation derived from cubic spline definition,
2229 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2230 * the cubic spline. We first calculate the negative of the
2231 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2232 * add the more standard, abrupt cutoff correction to that result,
2233 * yielding the long-range correction for a switched function. We
2234 * perform both the pressure and energy loops at the same time for
2235 * simplicity, as the computational cost is low. */
2239 /* Since the dispersion table has been scaled down a factor
2240 * 6.0 and the repulsion a factor 12.0 to compensate for the
2241 * c6/c12 parameters inside nbfp[] being scaled up (to save
2242 * flops in kernels), we need to correct for this.
2253 for (ri = rstart; ri < rend; ++ri)
2257 eb = 2.0*invscale2*r;
2261 pb = 3.0*invscale2*r;
2262 pc = 3.0*invscale*r*r;
2265 /* this "8" is from the packing in the vdwtab array - perhaps
2266 should be defined? */
2268 offset = 8*ri + offstart;
2269 y0 = vdwtab[offset];
2270 f = vdwtab[offset+1];
2271 g = vdwtab[offset+2];
2272 h = vdwtab[offset+3];
2274 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2275 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2277 *enerout = 4.0*M_PI*enersum*tabfactor;
2278 *virout = 4.0*M_PI*virsum*tabfactor;
2281 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2283 double eners[2], virs[2], enersum, virsum;
2284 double r0, rc3, rc9;
2286 real scale, *vdwtab;
2288 fr->enershiftsix = 0;
2289 fr->enershifttwelve = 0;
2290 fr->enerdiffsix = 0;
2291 fr->enerdifftwelve = 0;
2293 fr->virdifftwelve = 0;
2295 const interaction_const_t *ic = fr->ic;
2297 if (eDispCorr != edispcNO)
2299 for (i = 0; i < 2; i++)
2304 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2305 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2306 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2307 (ic->vdwtype == evdwSHIFT) ||
2308 (ic->vdwtype == evdwSWITCH))
2310 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2311 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2312 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2315 "With dispersion correction rvdw-switch can not be zero "
2316 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2319 /* TODO This code depends on the logic in tables.c that
2320 constructs the table layout, which should be made
2321 explicit in future cleanup. */
2322 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2323 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2324 scale = fr->dispersionCorrectionTable->scale;
2325 vdwtab = fr->dispersionCorrectionTable->data;
2327 /* Round the cut-offs to exact table values for precision */
2328 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2329 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2331 /* The code below has some support for handling force-switching, i.e.
2332 * when the force (instead of potential) is switched over a limited
2333 * region. This leads to a constant shift in the potential inside the
2334 * switching region, which we can handle by adding a constant energy
2335 * term in the force-switch case just like when we do potential-shift.
2337 * For now this is not enabled, but to keep the functionality in the
2338 * code we check separately for switch and shift. When we do force-switch
2339 * the shifting point is rvdw_switch, while it is the cutoff when we
2340 * have a classical potential-shift.
2342 * For a pure potential-shift the potential has a constant shift
2343 * all the way out to the cutoff, and that is it. For other forms
2344 * we need to calculate the constant shift up to the point where we
2345 * start modifying the potential.
2347 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2353 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2354 (ic->vdwtype == evdwSHIFT))
2356 /* Determine the constant energy shift below rvdw_switch.
2357 * Table has a scale factor since we have scaled it down to compensate
2358 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2360 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2361 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2363 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2365 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2366 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2369 /* Add the constant part from 0 to rvdw_switch.
2370 * This integration from 0 to rvdw_switch overcounts the number
2371 * of interactions by 1, as it also counts the self interaction.
2372 * We will correct for this later.
2374 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2375 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2377 /* Calculate the contribution in the range [r0,r1] where we
2378 * modify the potential. For a pure potential-shift modifier we will
2379 * have ri0==ri1, and there will not be any contribution here.
2381 for (i = 0; i < 2; i++)
2385 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2386 eners[i] -= enersum;
2390 /* Alright: Above we compensated by REMOVING the parts outside r0
2391 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2393 * Regardless of whether r0 is the point where we start switching,
2394 * or the cutoff where we calculated the constant shift, we include
2395 * all the parts we are missing out to infinity from r0 by
2396 * calculating the analytical dispersion correction.
2398 eners[0] += -4.0*M_PI/(3.0*rc3);
2399 eners[1] += 4.0*M_PI/(9.0*rc9);
2400 virs[0] += 8.0*M_PI/rc3;
2401 virs[1] += -16.0*M_PI/(3.0*rc9);
2403 else if (ic->vdwtype == evdwCUT ||
2404 EVDW_PME(ic->vdwtype) ||
2405 ic->vdwtype == evdwUSER)
2407 if (ic->vdwtype == evdwUSER && fplog)
2410 "WARNING: using dispersion correction with user tables\n");
2413 /* Note that with LJ-PME, the dispersion correction is multiplied
2414 * by the difference between the actual C6 and the value of C6
2415 * that would produce the combination rule.
2416 * This means the normal energy and virial difference formulas
2420 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2422 /* Contribution beyond the cut-off */
2423 eners[0] += -4.0*M_PI/(3.0*rc3);
2424 eners[1] += 4.0*M_PI/(9.0*rc9);
2425 if (ic->vdw_modifier == eintmodPOTSHIFT)
2427 /* Contribution within the cut-off */
2428 eners[0] += -4.0*M_PI/(3.0*rc3);
2429 eners[1] += 4.0*M_PI/(3.0*rc9);
2431 /* Contribution beyond the cut-off */
2432 virs[0] += 8.0*M_PI/rc3;
2433 virs[1] += -16.0*M_PI/(3.0*rc9);
2438 "Dispersion correction is not implemented for vdw-type = %s",
2439 evdw_names[ic->vdwtype]);
2442 /* When we deprecate the group kernels the code below can go too */
2443 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2445 /* Calculate self-interaction coefficient (assuming that
2446 * the reciprocal-space contribution is constant in the
2447 * region that contributes to the self-interaction).
2449 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2451 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2452 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2455 fr->enerdiffsix = eners[0];
2456 fr->enerdifftwelve = eners[1];
2457 /* The 0.5 is due to the Gromacs definition of the virial */
2458 fr->virdiffsix = 0.5*virs[0];
2459 fr->virdifftwelve = 0.5*virs[1];
2463 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2464 const matrix box, real lambda, tensor pres, tensor virial,
2465 real *prescorr, real *enercorr, real *dvdlcorr)
2467 gmx_bool bCorrAll, bCorrPres;
2468 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2478 if (ir->eDispCorr != edispcNO)
2480 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2481 ir->eDispCorr == edispcAllEnerPres);
2482 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2483 ir->eDispCorr == edispcAllEnerPres);
2485 invvol = 1/det(box);
2488 /* Only correct for the interactions with the inserted molecule */
2489 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2494 dens = fr->numAtomsForDispersionCorrection*invvol;
2495 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2498 if (ir->efep == efepNO)
2500 avcsix = fr->avcsix[0];
2501 avctwelve = fr->avctwelve[0];
2505 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2506 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2509 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2510 *enercorr += avcsix*enerdiff;
2512 if (ir->efep != efepNO)
2514 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2518 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2519 *enercorr += avctwelve*enerdiff;
2520 if (fr->efep != efepNO)
2522 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2528 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2529 if (ir->eDispCorr == edispcAllEnerPres)
2531 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2533 /* The factor 2 is because of the Gromacs virial definition */
2534 spres = -2.0*invvol*svir*PRESFAC;
2536 for (m = 0; m < DIM; m++)
2538 virial[m][m] += svir;
2539 pres[m][m] += spres;
2544 /* Can't currently control when it prints, for now, just print when degugging */
2549 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2555 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2556 *enercorr, spres, svir);
2560 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2564 if (fr->efep != efepNO)
2566 *dvdlcorr += dvdlambda;
2571 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2572 const gmx_mtop_t *mtop, rvec x[],
2578 if (bFirst && fplog)
2580 fprintf(fplog, "Removing pbc first time\n");
2585 for (const gmx_molblock_t &molb : mtop->molblock)
2587 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2588 if (moltype.atoms.nr == 1 ||
2589 (!bFirst && moltype.cgs.nr == 1))
2591 /* Just one atom or charge group in the molecule, no PBC required */
2592 as += molb.nmol*moltype.atoms.nr;
2596 mk_graph_moltype(moltype, graph);
2598 for (mol = 0; mol < molb.nmol; mol++)
2600 mk_mshift(fplog, graph, ePBC, box, x+as);
2602 shift_self(graph, box, x+as);
2603 /* The molecule is whole now.
2604 * We don't need the second mk_mshift call as in do_pbc_first,
2605 * since we no longer need this graph.
2608 as += moltype.atoms.nr;
2616 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2617 const gmx_mtop_t *mtop, rvec x[])
2619 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2622 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2623 const gmx_mtop_t *mtop, rvec x[])
2625 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2628 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2631 nth = gmx_omp_nthreads_get(emntDefault);
2633 #pragma omp parallel for num_threads(nth) schedule(static)
2634 for (t = 0; t < nth; t++)
2638 size_t natoms = x.size();
2639 size_t offset = (natoms*t )/nth;
2640 size_t len = (natoms*(t + 1))/nth - offset;
2641 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2643 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2647 // TODO This can be cleaned up a lot, and move back to runner.cpp
2648 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2649 const t_inputrec *inputrec,
2650 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2651 gmx_walltime_accounting_t walltime_accounting,
2652 nonbonded_verlet_t *nbv,
2653 const gmx_pme_t *pme,
2654 gmx_bool bWriteStat)
2656 t_nrnb *nrnb_tot = nullptr;
2658 double nbfs = 0, mflop = 0;
2659 double elapsed_time,
2660 elapsed_time_over_all_ranks,
2661 elapsed_time_over_all_threads,
2662 elapsed_time_over_all_threads_over_all_ranks;
2663 /* Control whether it is valid to print a report. Only the
2664 simulation master may print, but it should not do so if the run
2665 terminated e.g. before a scheduled reset step. This is
2666 complicated by the fact that PME ranks are unaware of the
2667 reason why they were sent a pmerecvqxFINISH. To avoid
2668 communication deadlocks, we always do the communication for the
2669 report, even if we've decided not to write the report, because
2670 how long it takes to finish the run is not important when we've
2671 decided not to report on the simulation performance.
2673 Further, we only report performance for dynamical integrators,
2674 because those are the only ones for which we plan to
2675 consider doing any optimizations. */
2676 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2678 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2680 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2681 printReport = false;
2688 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2689 cr->mpi_comm_mysim);
2697 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2698 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2702 /* reduce elapsed_time over all MPI ranks in the current simulation */
2703 MPI_Allreduce(&elapsed_time,
2704 &elapsed_time_over_all_ranks,
2705 1, MPI_DOUBLE, MPI_SUM,
2706 cr->mpi_comm_mysim);
2707 elapsed_time_over_all_ranks /= cr->nnodes;
2708 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2709 * current simulation. */
2710 MPI_Allreduce(&elapsed_time_over_all_threads,
2711 &elapsed_time_over_all_threads_over_all_ranks,
2712 1, MPI_DOUBLE, MPI_SUM,
2713 cr->mpi_comm_mysim);
2718 elapsed_time_over_all_ranks = elapsed_time;
2719 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2724 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2731 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2733 print_dd_statistics(cr, inputrec, fplog);
2736 /* TODO Move the responsibility for any scaling by thread counts
2737 * to the code that handled the thread region, so that there's a
2738 * mechanism to keep cycle counting working during the transition
2739 * to task parallelism. */
2740 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2741 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2742 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2743 auto cycle_sum(wallcycle_sum(cr, wcycle));
2747 auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2748 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2749 if (pme_gpu_task_enabled(pme))
2751 pme_gpu_get_timings(pme, &pme_gpu_timings);
2753 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2754 elapsed_time_over_all_ranks,
2759 if (EI_DYNAMICS(inputrec->eI))
2761 delta_t = inputrec->delta_t;
2766 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2767 elapsed_time_over_all_ranks,
2768 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2769 delta_t, nbfs, mflop);
2773 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2774 elapsed_time_over_all_ranks,
2775 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2776 delta_t, nbfs, mflop);
2781 void initialize_lambdas(FILE *fplog,
2782 const t_inputrec &ir,
2785 gmx::ArrayRef<real> lambda,
2788 /* TODO: Clean up initialization of fep_state and lambda in
2789 t_state. This function works, but could probably use a logic
2790 rewrite to keep all the different types of efep straight. */
2792 if ((ir.efep == efepNO) && (!ir.bSimTemp))
2797 const t_lambda *fep = ir.fepvals;
2800 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2801 if checkpoint is set -- a kludge is in for now
2805 for (int i = 0; i < efptNR; i++)
2808 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2809 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2811 thisLambda = fep->init_lambda;
2815 thisLambda = fep->all_lambda[i][fep->init_fep_state];
2819 lambda[i] = thisLambda;
2821 if (lam0 != nullptr)
2823 lam0[i] = thisLambda;
2828 /* need to rescale control temperatures to match current state */
2829 for (int i = 0; i < ir.opts.ngtc; i++)
2831 if (ir.opts.ref_t[i] > 0)
2833 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2838 /* Send to the log the information on the current lambdas */
2839 if (fplog != nullptr)
2841 fprintf(fplog, "Initial vector of lambda components:[ ");
2842 for (const auto &l : lambda)
2844 fprintf(fplog, "%10.4f ", l);
2846 fprintf(fplog, "]\n");