/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/bonded.h"
#include "gromacs/listed_forces/disre.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/listed_forces/manage_threading.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
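
/* Usage note (added for clarity): only the presence of the variable is
 * tested above, so any value enables the switch, e.g.
 *   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun
 */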
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                int64_t                   step,
                const t_inputrec         *ir,
                const t_commrec          *cr)
{
    time_t finish;
    double dt, elapsed_seconds, time_per_step;

    fputs(gmx::int64ToString(step).c_str(), out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish       = static_cast<time_t>(seconds_since_epoch + dt);
                auto timebuf = gmx_ctime_r(&finish);
                timebuf.erase(timebuf.find_first_of('\n'));
                fputs(", will finish ", out);
                fputs(timebuf.c_str(), out);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
            }
        }
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }

    GMX_UNUSED_VALUE(cr);
}
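
/* Unit check for the ns/day estimate above (added note, not original code):
 * delta_t is in ps, so delta_t/1000 is ns advanced per MD step; dividing by
 * time_per_step (seconds/step) gives ns/s, and multiplying by 24*60*60 s/day
 * yields ns/day. E.g. 2 fs steps computed at 2 ms/step give
 * 0.002/1000*86400/0.002 = 86.4 ns/day.
 */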
void print_date_and_time(FILE *fplog, int nodeid, const char *title, double the_time)
{
    if (!fplog) { return; }

    time_t temp_time = static_cast<time_t>(the_time);
    auto   timebuf   = gmx_ctime_r(&temp_time);
    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
}
void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];
    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
    const int end = forceToAdd.size();

    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
                        tensor vir_part, const t_graph *graph, const matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
static void pull_potential_wrapper(const t_commrec *cr,
                                   const t_inputrec *ir,
                                   const matrix box, gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial *force,
                                   const t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   const real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc pbc;
    real  dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(const t_commrec *cr,
                                   gmx::ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t *enerd,
                                   gmx_wallcycle_t wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE *fp, const t_mdatoms *md, const t_commrec *cr,
                               int64_t step, real forceTolerance,
                               const rvec *x, const rvec *f)
{
    real       force2Tolerance = gmx::square(forceTolerance);
    gmx::index numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
    }
}
static void post_process_forces(const t_commrec *cr,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const matrix box,
                                const rvec x[],
                                rvec f[],
                                gmx::ForceWithVirial *forceWithVirial,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                const t_graph *graph,
                                const t_forcerec *fr,
                                const gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->haveDirectVirialContributions)
    {
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL) != 0, virial,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(t_forcerec *fr,
                         const interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags,
                         const Nbnxm::InteractionLocality ilocality,
                         int clearF,
                         int64_t step,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);

    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }
}
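
/* Pruning cadence example (added note): with an outer list lifetime of
 * nstlist = 40 and nstlistPrune = 10, the modulo test above dispatches the
 * CPU prune kernel every 10 steps of the list's lifetime, so the pair list
 * is re-pruned with current coordinates before it drifts past rlistInner.
 */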
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec *fr,
                             rvec x[],
                             rvec f[],
                             const t_mdatoms *mdatoms,
                             t_lambda *fepvals,
                             real *lambda,
                             gmx_enerdata_t *enerd,
                             int flags,
                             t_nrnb *nrnb,
                             gmx_wallcycle_t wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i] = 0;
    }

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energy contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
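
/* Foreign-lambda note (added for clarity): in the loop above, i == 0
 * re-evaluates the energy at the current lambda vector while i >= 1 uses the
 * foreign points fepvals->all_lambda[j][i-1]; each total is accumulated into
 * enerd->enerpart_lambda[i] for later free-energy estimators (e.g. BAR).
 */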
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}
static inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++) { clear_rvec(v[i]); }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++) { clear_rvec(v[i]); }
    }
}
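
/* The single-thread fallback above in its generic form (an illustrative
 * sketch, not GROMACS API; runMaybeParallel is a hypothetical name):
 * \code
 * template<typename F>
 * void runMaybeParallel(int n, int nth, F &&body)
 * {
 *     if (nth == 1)
 *     {
 *         // Plain loop: no OpenMP region setup cost for small n
 *         for (int i = 0; i < n; i++) { body(i); }
 *     }
 *     else
 *     {
 * #pragma omp parallel for num_threads(nth) schedule(static)
 *         for (int i = 0; i < n; i++) { body(i); }
 *     }
 * }
 * \endcode
 */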
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
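
/* Worked example (added note): by equipartition each coupled degree of
 * freedom carries kB*T/2 on average, so a single T-coupling group of 1000
 * rigid waters (nrdf ~ 6000) at ref_t = 300 K gives
 * 6000 * 0.5 * 300 * BOLTZ ~ 7.5e3 kJ/mol, with BOLTZ ~ 0.00831 kJ/(mol K).
 */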
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t               step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real           averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
    {
        gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  step,
                  enerd.term[F_EPOT],
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ],
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
    }
}
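
/* Numerical sense check (added note): for the ~7.5e3 kJ/mol kinetic-energy
 * estimate in the example above, the first-step threshold is
 * c_thresholdFactor * 7.5e3 ~ 7.5e9 kJ/mol, so only catastrophically
 * overlapping starting configurations can trigger the fatal error.
 */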
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     awh              The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation Enforced rotation module.
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE                          *fplog,
                     const t_commrec               *cr,
                     const t_inputrec              *inputrec,
                     gmx::Awh                      *awh,
                     gmx_enfrot                    *enforcedRotation,
                     int64_t                        step,
                     double                         t,
                     gmx_wallcycle_t                wcycle,
                     ForceProviders                *forceProviders,
                     matrix                         box,
                     gmx::ArrayRef<const gmx::RVec> x,
                     const t_mdatoms               *mdatoms,
                     real                          *lambda,
                     int                            forceFlags,
                     gmx::ForceWithVirial          *forceWithVirial,
                     gmx_enerdata_t                *enerd,
                     gmx_edsam                     *ed,
                     gmx_bool                       bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;

    /* NOTE: Currently all ForceProviders only provide forces.
     * When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               forceWithVirial,
                               mdatoms, enerd, lambda, t,
                               wcycle);

        if (awh)
        {
            enerd->term[F_COM_PULL] +=
                awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                                  forceWithVirial,
                                                  t, step, wcycle, fplog);
        }
    }

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata   The PME structure
 * \param[in]  box       The box matrix
 * \param[in]  x         Coordinate array
 * \param[in]  flags     Force flags
 * \param[in]  pmeFlags  PME flags
 * \param[in]  wcycle    The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      matrix          box,
                                      rvec            x[],
                                      int             flags,
                                      int             pmeFlags,
                                      gmx_wallcycle_t wcycle)
{
    pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  wcycle   The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t      *pmedata,
                                     gmx_wallcycle_t wcycle)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in,out] pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     pmeFlags         PME flags
 * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
                                        gmx_pme_t                           *pmedata,
                                        gmx::ArrayRefWithPadding<gmx::RVec> *force,
                                        gmx::ForceWithVirial                *forceWithVirial,
                                        rvec                                 fshift[],
                                        gmx_enerdata_t                      *enerd,
                                        int                                  flags,
                                        int                                  pmeFlags,
                                        bool                                 haveOtherWork,
                                        gmx_wallcycle_t                      wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                     flags,
                                                     Nbnxm::AtomLocality::Local,
                                                     haveOtherWork,
                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                     fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
            if (isNbGpuDone)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                               nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
            }
        }
    }
}
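
/* Illustrative analogue of the alternating wait above (a sketch using
 * standard C++ futures, not GROMACS API; names are hypothetical): poll two
 * asynchronous producers and run each reduction as soon as its producer
 * completes, so one reduction always overlaps the other task regardless of
 * completion order.
 * \code
 * #include <chrono>
 * #include <future>
 *
 * template<typename T1, typename T2, typename R1, typename R2>
 * void alternatingWaitReduce(std::future<T1> &a, std::future<T2> &b,
 *                            R1 reduceA, R2 reduceB)
 * {
 *     bool aDone = false;
 *     bool bDone = false;
 *     while (!aDone || !bDone)
 *     {
 *         // Poll (zero timeout) while the other task is pending; block
 *         // (long timeout) once the other task has already been reduced.
 *         const auto tA = std::chrono::hours(bDone ? 24 : 0);
 *         if (!aDone && a.wait_for(tA) == std::future_status::ready)
 *         {
 *             reduceA(a.get());
 *             aDone = true;
 *         }
 *         const auto tB = std::chrono::hours(aDone ? 24 : 0);
 *         if (!bDone && b.wait_for(tB) == std::future_status::ready)
 *         {
 *             reduceB(b.get());
 *             bDone = true;
 *         }
 *     }
 * }
 * \endcode
 */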
/*! \brief
 * Launch the dynamic rolling pruning GPU task.
 *
 * We currently alternate local/non-local list pruning in odd-even steps
 * (only pruning every second step without DD).
 *
 * \param[in]  cr        The communication record
 * \param[in]  nbv       Nonbonded verlet structure
 * \param[in]  inputrec  The input record
 * \param[in]  step      The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
                                           const int64_t             step)
{
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || DOMAINDECOMP(cr)))
    {
        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
                                           numRollingParts);
    }
}
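
/* Schedule example (added note): with nstlist = 20 and nstlistPrune = 10 the
 * assertion above requires numRollingParts = 5. Within a list's lifetime,
 * even steps prune one of the 5 local-list parts and, with DD, odd steps
 * prune a non-local part, so each list is fully pruned about once per
 * nstlistPrune interval while never launching at or next to a search step.
 */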
static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx::Awh *awh,
                                gmx_enfrot *enforcedRotation,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                                history_t *hist,
                                gmx::ArrayRefWithPadding<gmx::RVec> force,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda,
                                t_graph *graph,
                                t_forcerec *fr,
                                gmx::PpForceWorkload *ppForceWorkload,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                rvec mu_tot,
                                double t,
                                gmx_edsam *ed,
                                int flags,
                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu = 0;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
    const int  pmeFlags   = GMX_PME_SPREAD | GMX_PME_SOLVE |
        ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
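
    /* Worked example of the flag composition above (added note): a step that
     * requests forces and the virial, flags = GMX_FORCE_FORCES | GMX_FORCE_VIRIAL,
     * yields pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_ENER_VIR |
     * GMX_PME_CALC_F; the energy/virial bit is set by either GMX_FORCE_VIRIAL
     * or GMX_FORCE_ENERGY, which is why it appears in two conditionals.
     */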
    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS)
    {
        ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
    }

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    update_forcerec(fr, box);

    if (inputrecNeedMutot(inputrec))
    {
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                mu, mu+DIM);
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }
    }

    nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
                                 fr->shift_vec, nbv->nbat);

#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */

    if (useGpuPme)
    {
        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
    }
    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!DOMAINDECOMP(cr))
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
                              0, vzero, box_diag,
                              nullptr, 0, mdatoms->homenr, -1,
                              fr->cginfo, x.unpaddedArrayRef(),
                              0, nullptr,
                              nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                              nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                       fr->cginfo, x.unpaddedArrayRef(),
                                       nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                       nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);

        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS)
        {
            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
        }

        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS && fr->gpuBonded)
        {
            /* Now we put all atoms on the grid, we can assign bonded
             * interactions to the GPU, where the grid order is
             * needed. Also the xq, f and fshift device buffers have
             * been reallocated if needed, so the bonded code can
             * learn about them. */
            // TODO the xq, f, and fshift buffers are now shared
            // resources, so they should be maintained by a
            // higher-level object than the nb module.
            fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                  top->idef,
                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
            ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
        }

        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    /* do local pair search */
    if (bNS)
    {
        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;

        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                            &top->excls,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &pairlistSet,
                            Nbnxm::InteractionLocality::Local,
                            nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                            nrnb);
        pairlistSet.outerListCreationStep = step;
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
        {
            nbnxnPrepareListForDynamicPruning(&pairlistSet);
        }
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
                                     pairlistSet.nblGpu[0],
                                     Nbnxm::InteractionLocality::Local);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                        nbv->nbat, wcycle);
    }

    if (bUseGPU)
    {
        ddBalanceRegionHandler.openBeforeForceComputationGpu();

        wallcycle_start(wcycle, ewcLAUNCH_GPU);

        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        // bonded work not split into separate local and non-local, so with DD
        // we can only launch the kernel after non-local coordinates have been received.
        if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
        {
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchKernels(fr, flags, box);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        }

        /* launch local nonbonded work on GPU */
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (useGpuPme)
    {
        // In PME GPU and mixed mode we launch FFT / gather after the
        // X copy/transform to allow overlap as well as after the GPU NB
        // launch to avoid FFT launch overhead hijacking the CPU and delaying
        // the nonbonded kernel.
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
    }
    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                                &top->excls,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &pairlistSet,
                                Nbnxm::InteractionLocality::NonLocal,
                                nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                nrnb);
            pairlistSet.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
            {
                nbnxnPrepareListForDynamicPruning(&pairlistSet);
            }
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
                                         pairlistSet.nblGpu[0],
                                         Nbnxm::InteractionLocality::NonLocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                            nbv->nbat, wcycle);
        }

        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU);

            /* launch non-local nonbonded tasks on GPU */
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            if (ppForceWorkload->haveGpuBondedWork)
            {
                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
                fr->gpuBonded->launchKernels(fr, flags, box);
                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
            }

            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (DOMAINDECOMP(cr))
        {
            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
        }
        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
        {
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchEnergyTransfer();
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        }
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);

            ddBalanceRegionHandler.reopenRegionCpu();
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* TODO: update comment
             * We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }

        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);
    }

    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }
    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */
    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
                             fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
                             fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                         step, nrnb, wcycle);
        }

        const Nbnxm::InteractionLocality iloc =
            (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);

        /* Add all the non-bonded force to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);

        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);

        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[iloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
                                                     fr->fshift);
        }
    }
    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags,
                      &cycles_pme, ddBalanceRegionHandler);

    wallcycle_stop(wcycle, ewcFORCE);

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         step, t, wcycle,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
                         ed, bNS);

    if (bUseOrEmulGPU)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                            flags, Nbnxm::AtomLocality::NonLocal,
                                            ppForceWorkload->haveGpuBondedWork,
                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                            fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }

            /* skip the reduction if there was no non-local work to do */
            if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                               nbv->nbat, f, wcycle);
            }
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        ddBalanceRegionHandler.closeAfterForceComputationCpu();

        if (bDoForces)
        {
            dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
        }
    }

    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
    }

    if (!alternateGpuWait && useGpuPme)
    {
        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
    }
    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && bUseGPU)
    {
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         * of the step time.
         */
        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                    fr->fshift);
        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

        if (ddBalanceRegionHandler.useBalancingRegion())
        {
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
            {
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            }
            ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
        }
    }

    if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
    {
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);
    }

    if (useGpuPme)
    {
        pme_gpu_reinit_computation(fr->pmedata, wcycle);
    }

    if (bUseGPU)
    {
        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);

        /* Is dynamic pair-list pruning activated? */
        if (nbv->listParams->useDynamicPruning)
        {
            launchGpuRollingPruning(cr, nbv, inputrec, step);
        }
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
    {
        wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
        // in principle this should be included in the DD balancing region,
        // but generally it is infrequent so we'll omit it for the sake of
        // simpler code
        fr->gpuBonded->accumulateEnergyTerms(enerd);
        wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
        fr->gpuBonded->clearEnergies();
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (bUseOrEmulGPU && !alternateGpuWait)
    {
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                       nbv->nbat, f, wcycle);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }

    if (bDoForces)
    {
        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
static void do_force_cutsGROUP(FILE *fplog,
                               const t_commrec *cr,
                               const gmx_multisim_t *ms,
                               const t_inputrec *inputrec,
                               gmx::Awh *awh,
                               gmx_enfrot *enforcedRotation,
                               int64_t step,
                               t_nrnb *nrnb,
                               gmx_wallcycle_t wcycle,
                               gmx_localtop_t *top,
                               const gmx_groups_t *groups,
                               matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                               history_t *hist,
                               gmx::ArrayRefWithPadding<gmx::RVec> force,
                               tensor vir_force,
                               const t_mdatoms *mdatoms,
                               gmx_enerdata_t *enerd,
                               t_fcdata *fcd,
                               real *lambda,
                               t_graph *graph,
                               t_forcerec *fr,
                               const gmx_vsite_t *vsite,
                               rvec mu_tot,
                               double t,
                               gmx_edsam *ed,
                               int flags,
                               const DDBalanceRegionHandler &ddBalanceRegionHandler)
{
    int      cg0, cg1, i, j;
    double   mu[2*DIM];
    gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool bDoForces;
    float    cycles_pme;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    update_forcerec(fr, box);

    if (inputrecNeedMutot(inputrec))
    {
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                mu, mu+DIM);
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
                                     &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
            inc_nrnb(nrnb, eNR_CGCM, homenr);
            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }
    }
    else if (bCalcCGCM)
    {
        calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
        inc_nrnb(nrnb, eNR_CGCM, homenr);
    }

    if (bCalcCGCM && gmx_debug_at)
    {
        pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
    }

#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */
    /* Communicate coordinates and sum dipole if necessary */
    if (DOMAINDECOMP(cr))
    {
        dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

        /* No GPU support, no move_x overlap, so reopen the balance region here */
        ddBalanceRegionHandler.reopenRegionCpu();
    }

    if (inputrecNeedMutot(inputrec))
    {
        if (bStateChanged)
        {
            if (PAR(cr))
            {
                gmx_sumd(2*DIM, mu, cr);

                ddBalanceRegionHandler.reopenRegionCpu();
            }
            for (i = 0; i < 2; i++)
            {
                for (j = 0; j < DIM; j++)
                {
                    fr->mu_tot[i][j] = mu[i*DIM + j];
                }
            }
        }
        if (fr->efep == efepNO)
        {
            copy_rvec(fr->mu_tot[0], mu_tot);
        }
        else
        {
            for (j = 0; j < DIM; j++)
            {
                mu_tot[j] =
                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
            }
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (bNS)
    {
        wallcycle_start(wcycle, ewcNS);

        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }

        /* Do the actual neighbour searching */
        ns(fplog, fr, box,
           groups, top, mdatoms,
           cr, nrnb, bFillGrid);

        wallcycle_stop(wcycle, ewcNS);
    }
    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * separately, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }

        /* Clear the short- and long-range forces */
        clear_rvecs(fr->natoms_force_constr, f);
    }

    /* forceWithVirial might need the full force atom range */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda,
                      graph, &(top->excls), fr->mu_tot,
                      flags,
                      &cycles_pme, ddBalanceRegionHandler);

    wallcycle_stop(wcycle, ewcFORCE);

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);

        ddBalanceRegionHandler.closeAfterForceComputationCpu();
    }

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         step, t, wcycle,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
                         ed, bNS);

    if (bDoForces)
    {
        /* Communicate the forces */
        if (DOMAINDECOMP(cr))
        {
            dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
            /* Do we need to communicate the separate force array
             * for terms that do not contribute to the single sum virial?
             * Position restraints and electric fields do not introduce
             * inter-cg forces, only full electrostatics methods do.
             * When we do not calculate the virial, fr->f_novirsum = f,
             * so we have already communicated these forces.
             */
            if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
                (flags & GMX_FORCE_VIRIAL))
            {
                dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
            }
        }

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
2015 void do_force(FILE *fplog,
2016 const t_commrec *cr,
2017 const gmx_multisim_t *ms,
2018 const t_inputrec *inputrec,
2020 gmx_enfrot *enforcedRotation,
2023 gmx_wallcycle_t wcycle,
2024 gmx_localtop_t *top,
2025 const gmx_groups_t *groups,
2027 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
2029 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
2031 const t_mdatoms *mdatoms,
2032 gmx_enerdata_t *enerd,
2034 gmx::ArrayRef<real> lambda,
2037 gmx::PpForceWorkload *ppForceWorkload,
2038 const gmx_vsite_t *vsite,
2043 const DDBalanceRegionHandler &ddBalanceRegionHandler)
2045 /* modify force flag if not doing nonbonded */
2046 if (!fr->bNonbonded)
2048 flags &= ~GMX_FORCE_NONBONDED;
2051 switch (inputrec->cutoff_scheme)
2054 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2055 awh, enforcedRotation, step, nrnb, wcycle,
2062 lambda.data(), graph,
2069 ddBalanceRegionHandler);
2072 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2073 awh, enforcedRotation, step, nrnb, wcycle,
2080 lambda.data(), graph,
2084 ddBalanceRegionHandler);
2087 gmx_incons("Invalid cut-off scheme passed!");
2090 /* In case we don't have constraints and are using GPUs, the next balancing
2091 * region starts here.
2092 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2093 * virial calculation and COM pulling, is thus not included in
2094 * the balance timing, which is OK since these tasks mostly do communication.
2096 ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
2100 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2101 const t_inputrec *ir, const t_mdatoms *md,
2104 int i, m, start, end;
2106 real dt = ir->delta_t;
2110 /* We need to allocate one element extra, since we might use
2111 * (unaligned) 4-wide SIMD loads to access rvec entries.
2113 snew(savex, state->natoms + 1);
2120 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2121 start, md->homenr, end);
2123 /* Do a first constraint pass to reset the particles... */
2124 step = ir->init_step;
2127 char buf[STEPSTRSIZE];
2128 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2129 gmx_step_str(step, buf));
2133 /* constrain the current position */
2134 constr->apply(TRUE, FALSE,
2136 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2138 state->lambda[efptBONDED], &dvdl_dum,
2139 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2142 /* constrain the initial velocities, and save them */
2143 /* this may also be useful if we need the ekin from the half step for velocity Verlet */
2144 constr->apply(TRUE, FALSE,
2146 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2148 state->lambda[efptBONDED], &dvdl_dum,
2149 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2151 /* constrain the initial velocities at t-dt/2 */
2152 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2154 auto x = makeArrayRef(state->x).subArray(start, end);
2155 auto v = makeArrayRef(state->v).subArray(start, end);
2156 for (i = start; (i < end); i++)
2158 for (m = 0; (m < DIM); m++)
2160 /* Reverse the velocity */
2161 v[i][m] = -v[i][m];
2162 /* Store the position at t-dt in buf */
2163 savex[i][m] = x[i][m] + dt*v[i][m];
2166 /* Shake the positions at t=-dt with the positions at t=0
2167 * as reference coordinates.
2171 char buf[STEPSTRSIZE];
2172 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2173 gmx_step_str(step, buf));
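/* What the apply() call below achieves: the loop above took a backward
 * Euler step, savex = x(0) + dt*(-v), i.e. the unconstrained positions
 * at t = -dt. Constraining savex with x(0) as the reference projects
 * those positions onto the constraint surface, and since the current
 * velocities are passed in as well, they receive the matching
 * constraint correction, giving consistent velocities at t = -dt/2.
 */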
2176 constr->apply(TRUE, FALSE,
2178 state->x.rvec_array(), savex, nullptr,
2180 state->lambda[efptBONDED], &dvdl_dum,
2181 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2183 for (i = start; i < end; i++)
2185 for (m = 0; m < DIM; m++)
2187 /* Re-reverse the velocities */
2188 v[i][m] = -v[i][m];
2197 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2198 double *enerout, double *virout)
2200 double enersum, virsum;
2201 double invscale, invscale2, invscale3;
2202 double r, ea, eb, ec, pa, pb, pc, pd;
2207 invscale = 1.0/scale;
2208 invscale2 = invscale*invscale;
2209 invscale3 = invscale*invscale2;
2211 /* Following summation derived from cubic spline definition,
2212 * Numerical Recipes in C, second edition, pp. 113-116. Exact for
2213 * the cubic spline. We first calculate the negative of the
2214 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2215 * add the more standard, abrupt cutoff correction to that result,
2216 * yielding the long-range correction for a switched function. We
2217 * perform both the pressure and energy loops at the same time for
2218 * simplicity, as the computational cost is low. */
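/* In more detail: within table bin ri the tabulated function is the
 * cubic y(u) = y0 + f*u + g*u^2 + h*u^3 for u in [0,1], with the
 * radius running from r = ri*invscale to r + invscale. The shell
 * integral over one bin then becomes
 *   int r'^2 y dr'  =  int_0^1 (ec + eb*u + ea*u^2) y(u) du,
 * with ea = invscale^3, eb = 2*invscale^2*r and ec = invscale*r^2,
 * which yields the 1/2, 1/3, ... moment weights in the enersum line
 * below; the virial sum works the same way with one extra power of r'
 * (the pa..pd weights).
 */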
2222 /* Since the dispersion table has been scaled down a factor
2223 * 6.0 and the repulsion a factor 12.0 to compensate for the
2224 * c6/c12 parameters inside nbfp[] being scaled up (to save
2225 * flops in kernels), we need to correct for this.
2236 for (ri = rstart; ri < rend; ++ri)
2240 eb = 2.0*invscale2*r;
2244 pb = 3.0*invscale2*r;
2245 pc = 3.0*invscale*r*r;
2248 /* this "8" is from the packing in the vdwtab array - perhaps
2249 should be defined? */
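/* For reference: each table point packs four spline coefficients
 * (Y, F, G, H) for the dispersion part followed by four for the
 * repulsion part, hence the stride of 8 and the offstart of 0 or 4
 * passed by the caller (see the integrate_table() calls in
 * calc_enervirdiff() below).
 */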
2251 offset = 8*ri + offstart;
2252 y0 = vdwtab[offset];
2253 f = vdwtab[offset+1];
2254 g = vdwtab[offset+2];
2255 h = vdwtab[offset+3];
2257 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2258 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2260 *enerout = 4.0*M_PI*enersum*tabfactor;
2261 *virout = 4.0*M_PI*virsum*tabfactor;
2264 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2266 double eners[2], virs[2], enersum, virsum;
2267 double r0, rc3, rc9;
2269 real scale, *vdwtab;
2271 fr->enershiftsix = 0;
2272 fr->enershifttwelve = 0;
2273 fr->enerdiffsix = 0;
2274 fr->enerdifftwelve = 0;
2276 fr->virdifftwelve = 0;
2278 const interaction_const_t *ic = fr->ic;
2280 if (eDispCorr != edispcNO)
2282 for (i = 0; i < 2; i++)
2287 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2288 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2289 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2290 (ic->vdwtype == evdwSHIFT) ||
2291 (ic->vdwtype == evdwSWITCH))
2293 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2294 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2295 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2298 "With dispersion correction rvdw-switch cannot be zero "
2299 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2302 /* TODO This code depends on the logic in tables.c that
2303 constructs the table layout, which should be made
2304 explicit in future cleanup. */
2305 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2306 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2307 scale = fr->dispersionCorrectionTable->scale;
2308 vdwtab = fr->dispersionCorrectionTable->data;
2310 /* Round the cut-offs to exact table values for precision */
2311 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2312 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2314 /* The code below has some support for handling force-switching, i.e.
2315 * when the force (instead of potential) is switched over a limited
2316 * region. This leads to a constant shift in the potential inside the
2317 * switching region, which we can handle by adding a constant energy
2318 * term in the force-switch case just like when we do potential-shift.
2320 * For now this is not enabled, but to keep the functionality in the
2321 * code we check separately for switch and shift. When we do force-switch
2322 * the shifting point is rvdw_switch, while it is the cutoff when we
2323 * have a classical potential-shift.
2325 * For a pure potential-shift the potential has a constant shift
2326 * all the way out to the cutoff, and that is it. For other forms
2327 * we need to calculate the constant shift up to the point where we
2328 * start modifying the potential.
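/* Concretely: for plain potential-shift the line below moves ri0 up
 * to ri1, so the switched-region integral further down is empty and
 * only the constant-shift and analytic-tail terms contribute.
 */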
2330 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2336 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2337 (ic->vdwtype == evdwSHIFT))
2339 /* Determine the constant energy shift below rvdw_switch.
2340 * The table carries a scale factor, since it was scaled down to compensate
2341 * for the c6/c12 parameters being scaled up by the derivative factors (to save flops in the analytical kernels).
2343 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2344 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2346 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2348 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2349 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2352 /* Add the constant part from 0 to rvdw_switch.
2353 * This integration from 0 to rvdw_switch overcounts the number
2354 * of interactions by 1, as it also counts the self-interaction.
2355 * We will correct for this later.
2357 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2358 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
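/* These terms are the constant shift c integrated over the sphere
 * where it acts: c * int_0^r0 4*pi*r^2 dr = c*(4*pi/3)*r0^3, per unit
 * number density (the density factor is applied later).
 */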
2360 /* Calculate the contribution in the range [r0,r1] where we
2361 * modify the potential. For a pure potential-shift modifier we will
2362 * have ri0==ri1, and there will not be any contribution here.
2364 for (i = 0; i < 2; i++)
2368 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2369 eners[i] -= enersum;
2373 /* Alright: Above we compensated by REMOVING the parts outside r0
2374 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2376 * Regardless of whether r0 is the point where we start switching,
2377 * or the cutoff where we calculated the constant shift, we include
2378 * all the parts we are missing out to infinity from r0 by
2379 * calculating the analytical dispersion correction.
2381 eners[0] += -4.0*M_PI/(3.0*rc3);
2382 eners[1] += 4.0*M_PI/(9.0*rc9);
2383 virs[0] += 8.0*M_PI/rc3;
2384 virs[1] += -16.0*M_PI/(3.0*rc9);
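/* These are the closed forms of the tail integrals from r0 outwards:
 *   int 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
 *   int 4*pi*r^2 * (+1/r^12) dr = +4*pi/(9*r0^9)
 * for the energies, and int 4*pi*r^3 * dV/dr for the corresponding
 * virial terms.
 */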
2386 else if (ic->vdwtype == evdwCUT ||
2387 EVDW_PME(ic->vdwtype) ||
2388 ic->vdwtype == evdwUSER)
2390 if (ic->vdwtype == evdwUSER && fplog)
2393 "WARNING: using dispersion correction with user tables\n");
2396 /* Note that with LJ-PME, the dispersion correction is multiplied
2397 * by the difference between the actual C6 and the value of C6
2398 * that would produce the combination rule.
2399 * This means the normal energy and virial difference formulas
2400 * can be used here.
2403 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2405 /* Contribution beyond the cut-off */
2406 eners[0] += -4.0*M_PI/(3.0*rc3);
2407 eners[1] += 4.0*M_PI/(9.0*rc9);
2408 if (ic->vdw_modifier == eintmodPOTSHIFT)
2410 /* Contribution within the cut-off */
2411 eners[0] += -4.0*M_PI/(3.0*rc3);
2412 eners[1] += 4.0*M_PI/(3.0*rc9);
2414 /* Contribution beyond the cut-off */
2415 virs[0] += 8.0*M_PI/rc3;
2416 virs[1] += -16.0*M_PI/(3.0*rc9);
2421 "Dispersion correction is not implemented for vdw-type = %s",
2422 evdw_names[ic->vdwtype]);
2425 /* When we deprecate the group kernels the code below can go too */
2426 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2428 /* Calculate self-interaction coefficient (assuming that
2429 * the reciprocal-space contribution is constant in the
2430 * region that contributes to the self-interaction).
2432 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2434 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2435 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2438 fr->enerdiffsix = eners[0];
2439 fr->enerdifftwelve = eners[1];
2440 /* The 0.5 is due to the GROMACS definition of the virial: Xi = -0.5*sum_i r_i (outer product) F_i */
2441 fr->virdiffsix = 0.5*virs[0];
2442 fr->virdifftwelve = 0.5*virs[1];
2446 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2447 const matrix box, real lambda, tensor pres, tensor virial,
2448 real *prescorr, real *enercorr, real *dvdlcorr)
2450 gmx_bool bCorrAll, bCorrPres;
2451 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2461 if (ir->eDispCorr != edispcNO)
2463 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2464 ir->eDispCorr == edispcAllEnerPres);
2465 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2466 ir->eDispCorr == edispcAllEnerPres);
2468 invvol = 1/det(box);
2471 /* Only correct for the interactions with the inserted molecule */
2472 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2477 dens = fr->numAtomsForDispersionCorrection*invvol;
2478 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2481 if (ir->efep == efepNO)
2483 avcsix = fr->avcsix[0];
2484 avctwelve = fr->avctwelve[0];
2488 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2489 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
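/* Since <C6>(lambda) interpolates linearly, d<C6>/dlambda is simply
 * avcsix[1] - avcsix[0]; the dvdlambda accumulation below therefore
 * multiplies that coefficient difference by the same enerdiff used
 * for the energy.
 */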
2492 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2493 *enercorr += avcsix*enerdiff;
2495 if (ir->efep != efepNO)
2497 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2501 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2502 *enercorr += avctwelve*enerdiff;
2503 if (fr->efep != efepNO)
2505 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2511 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2512 if (ir->eDispCorr == edispcAllEnerPres)
2514 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2516 /* The factor 2 is because of the Gromacs virial definition */
2517 spres = -2.0*invvol*svir*PRESFAC;
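/* This follows from the GROMACS pressure tensor P = 2*(Ekin - Xi)/V:
 * adding svir to a diagonal virial element shifts the corresponding
 * pressure element by -2*svir/V, with PRESFAC converting internal
 * units to bar.
 */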
2519 for (m = 0; m < DIM; m++)
2521 virial[m][m] += svir;
2522 pres[m][m] += spres;
2527 /* Can't currently control when this prints; for now, just print when debugging */
2532 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2538 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2539 *enercorr, spres, svir);
2543 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2547 if (fr->efep != efepNO)
2549 *dvdlcorr += dvdlambda;
2554 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2555 const gmx_mtop_t *mtop, rvec x[],
2561 if (bFirst && fplog)
2563 fprintf(fplog, "Removing pbc first time\n");
2568 for (const gmx_molblock_t &molb : mtop->molblock)
2570 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2571 if (moltype.atoms.nr == 1 ||
2572 (!bFirst && moltype.cgs.nr == 1))
2574 /* Just one atom or charge group in the molecule, no PBC required */
2575 as += molb.nmol*moltype.atoms.nr;
2579 mk_graph_moltype(moltype, graph);
2581 for (mol = 0; mol < molb.nmol; mol++)
2583 mk_mshift(fplog, graph, ePBC, box, x+as);
2585 shift_self(graph, box, x+as);
2586 /* The molecule is whole now.
2587 * We don't need the second mk_mshift call as in do_pbc_first,
2588 * since we no longer need this graph.
2591 as += moltype.atoms.nr;
2599 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2600 const gmx_mtop_t *mtop, rvec x[])
2602 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2605 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2606 const gmx_mtop_t *mtop, rvec x[])
2608 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2611 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2614 nth = gmx_omp_nthreads_get(emntDefault);
2616 #pragma omp parallel for num_threads(nth) schedule(static)
2617 for (t = 0; t < nth; t++)
2621 size_t natoms = x.size();
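/* Each thread gets a contiguous, near-equal slice; computing both
 * boundaries as (natoms*i)/nth makes consecutive slices share an
 * endpoint, so the slices tile [0, natoms) exactly without overlap.
 */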
2622 size_t offset = (natoms*t )/nth;
2623 size_t len = (natoms*(t + 1))/nth - offset;
2624 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2626 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2630 // TODO This can be cleaned up a lot, and move back to runner.cpp
2631 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2632 const t_inputrec *inputrec,
2633 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2634 gmx_walltime_accounting_t walltime_accounting,
2635 nonbonded_verlet_t *nbv,
2636 const gmx_pme_t *pme,
2637 gmx_bool bWriteStat)
2639 t_nrnb *nrnb_tot = nullptr;
2641 double nbfs = 0, mflop = 0;
2642 double elapsed_time,
2643 elapsed_time_over_all_ranks,
2644 elapsed_time_over_all_threads,
2645 elapsed_time_over_all_threads_over_all_ranks;
2646 /* Control whether it is valid to print a report. Only the
2647 simulation master may print, but it should not do so if the run
2648 terminated e.g. before a scheduled reset step. This is
2649 complicated by the fact that PME ranks are unaware of the
2650 reason why they were sent a pmerecvqxFINISH. To avoid
2651 communication deadlocks, we always do the communication for the
2652 report, even if we've decided not to write the report, because
2653 how long it takes to finish the run is not important when we've
2654 decided not to report on the simulation performance.
2656 Further, we only report performance for dynamical integrators,
2657 because those are the only ones for which we plan to
2658 consider doing any optimizations. */
2659 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2661 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2663 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2664 printReport = false;
2671 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2672 cr->mpi_comm_mysim);
2680 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2681 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2685 /* reduce elapsed_time over all MPI ranks in the current simulation */
2686 MPI_Allreduce(&elapsed_time,
2687 &elapsed_time_over_all_ranks,
2688 1, MPI_DOUBLE, MPI_SUM,
2689 cr->mpi_comm_mysim);
2690 elapsed_time_over_all_ranks /= cr->nnodes;
2691 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2692 * current simulation. */
2693 MPI_Allreduce(&elapsed_time_over_all_threads,
2694 &elapsed_time_over_all_threads_over_all_ranks,
2695 1, MPI_DOUBLE, MPI_SUM,
2696 cr->mpi_comm_mysim);
2701 elapsed_time_over_all_ranks = elapsed_time;
2702 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2707 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2714 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2716 print_dd_statistics(cr, inputrec, fplog);
2719 /* TODO Move the responsibility for any scaling by thread counts
2720 * to the code that handled the thread region, so that there's a
2721 * mechanism to keep cycle counting working during the transition
2722 * to task parallelism. */
2723 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2724 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2725 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2726 auto cycle_sum(wallcycle_sum(cr, wcycle));
2730 auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2731 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2732 if (pme_gpu_task_enabled(pme))
2734 pme_gpu_get_timings(pme, &pme_gpu_timings);
2736 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2737 elapsed_time_over_all_ranks,
2742 if (EI_DYNAMICS(inputrec->eI))
2744 delta_t = inputrec->delta_t;
2749 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2750 elapsed_time_over_all_ranks,
2751 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2752 delta_t, nbfs, mflop);
2756 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2757 elapsed_time_over_all_ranks,
2758 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2759 delta_t, nbfs, mflop);
2764 void initialize_lambdas(FILE *fplog,
2765 const t_inputrec &ir,
2768 gmx::ArrayRef<real> lambda,
2771 /* TODO: Clean up initialization of fep_state and lambda in
2772 t_state. This function works, but could probably use a logic
2773 rewrite to keep all the different types of efep straight. */
2775 if ((ir.efep == efepNO) && (!ir.bSimTemp))
2780 const t_lambda *fep = ir.fepvals;
2783 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2784 if checkpoint is set -- a kludge is in for now
2785 to prevent this. */
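/* Illustration of the loop below: with init_lambda >= 0 (e.g. 0.5)
 * every lambda component (fep, coul, vdw, bonded, ...) is set to that
 * single value; with init_lambda < 0 each component i instead takes
 * all_lambda[i][init_fep_state], i.e. the column of the lambda matrix
 * selected by the starting state.
 */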
2788 for (int i = 0; i < efptNR; i++)
2791 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2792 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2794 thisLambda = fep->init_lambda;
2798 thisLambda = fep->all_lambda[i][fep->init_fep_state];
2802 lambda[i] = thisLambda;
2804 if (lam0 != nullptr)
2806 lam0[i] = thisLambda;
2811 /* need to rescale control temperatures to match current state */
2812 for (int i = 0; i < ir.opts.ngtc; i++)
2814 if (ir.opts.ref_t[i] > 0)
2816 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2821 /* Send to the log the information on the current lambdas */
2822 if (fplog != nullptr)
2824 fprintf(fplog, "Initial vector of lambda components:[ ");
2825 for (const auto &l : lambda)
2827 fprintf(fplog, "%10.4f ", l);
2829 fprintf(fplog, "]\n");