1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
117
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
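// Usage sketch (assumed typical invocation, not from this source): running
//   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun ...
// disables the polling-based alternating wait below and falls back to the
// static PME-first ordering mentioned in the TODO above.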
123
124
125 void print_time(FILE                     *out,
126                 gmx_walltime_accounting_t walltime_accounting,
127                 int64_t                   step,
128                 const t_inputrec         *ir,
129                 const t_commrec          *cr)
130 {
131     time_t finish;
132     double dt, elapsed_seconds, time_per_step;
133
134 #if !GMX_THREAD_MPI
135     if (!PAR(cr))
136 #endif
137     {
138         fprintf(out, "\r");
139     }
140     fputs("step ", out);
141     fputs(gmx::int64ToString(step).c_str(), out);
142     fflush(out);
143
144     if ((step >= ir->nstlist))
145     {
146         double seconds_since_epoch = gmx_gettime();
147         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
149         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
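        /* dt is the predicted remaining wall-clock time in seconds: the number
         * of steps still to run times the average wall time per step so far. */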
150
151         if (ir->nsteps >= 0)
152         {
153             if (dt >= 300)
154             {
155                 finish = static_cast<time_t>(seconds_since_epoch + dt);
156                 auto timebuf = gmx_ctime_r(&finish);
157                 timebuf.erase(timebuf.find_first_of('\n'));
158                 fputs(", will finish ", out);
159                 fputs(timebuf.c_str(), out);
160             }
161             else
162             {
163                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
164             }
165         }
166         else
167         {
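            /* ir->nsteps < 0 means no fixed step count, so report the rate
             * instead: delta_t is in ps, so delta_t/1000 is ns per step;
             * dividing by the wall time per step and multiplying by
             * 24*60*60 s/day gives ns/day. */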
168             fprintf(out, " performance: %.1f ns/day    ",
169                     ir->delta_t/1000*24*60*60/time_per_step);
170         }
171     }
172 #if !GMX_THREAD_MPI
173     if (PAR(cr))
174     {
175         fprintf(out, "\n");
176     }
177 #else
178     GMX_UNUSED_VALUE(cr);
179 #endif
180
181     fflush(out);
182 }
183
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185                          double the_time)
186 {
187     if (!fplog)
188     {
189         return;
190     }
191
192     time_t temp_time = static_cast<time_t>(the_time);
193
194     auto   timebuf = gmx_ctime_r(&temp_time);
195
196     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
197 }
198
199 void print_start(FILE *fplog, const t_commrec *cr,
200                  gmx_walltime_accounting_t walltime_accounting,
201                  const char *name)
202 {
203     char buf[STRLEN];
204
205     sprintf(buf, "Started %s", name);
206     print_date_and_time(fplog, cr->nodeid, buf,
207                         walltime_accounting_get_start_time_stamp(walltime_accounting));
208 }
209
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
211 {
212     const int      end = forceToAdd.size();
213
214     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
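    /* Each thread adds a contiguous, disjoint slice of forceToAdd to f, so no
     * reduction or atomic operations are needed. */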
215 #pragma omp parallel for num_threads(nt) schedule(static)
216     for (int i = 0; i < end; i++)
217     {
218         rvec_inc(f[i], forceToAdd[i]);
219     }
220 }
221
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223                         tensor vir_part, const t_graph *graph, const matrix box,
224                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
225 {
226     /* The short-range virial from surrounding boxes */
227     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
229
230     /* Calculate partial virial, for local atoms only, based on short range.
231      * Total virial is computed in global_stat, called from do_md
232      */
233     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
235
236     if (debug)
237     {
238         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
239     }
240 }
241
242 static void pull_potential_wrapper(const t_commrec *cr,
243                                    const t_inputrec *ir,
244                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245                                    gmx::ForceWithVirial *force,
246                                    const t_mdatoms *mdatoms,
247                                    gmx_enerdata_t *enerd,
248                                    const real *lambda,
249                                    double t,
250                                    gmx_wallcycle_t wcycle)
251 {
252     t_pbc  pbc;
253     real   dvdl;
254
255     /* Calculate the center-of-mass pull forces; this requires communication,
256      * which is why pull_potential is called close to other communication.
257      */
258     wallcycle_start(wcycle, ewcPULLPOT);
259     set_pbc(&pbc, ir->ePBC, box);
260     dvdl                     = 0;
261     enerd->term[F_COM_PULL] +=
262         pull_potential(ir->pull_work, mdatoms, &pbc,
263                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265     wallcycle_stop(wcycle, ewcPULLPOT);
266 }
267
268 static void pme_receive_force_ener(const t_commrec      *cr,
269                                    gmx::ForceWithVirial *forceWithVirial,
270                                    gmx_enerdata_t       *enerd,
271                                    gmx_wallcycle_t       wcycle)
272 {
273     real   e_q, e_lj, dvdl_q, dvdl_lj;
274     float  cycles_ppdpme, cycles_seppme;
275
276     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
278
279     /* In case of node-splitting, the PP nodes receive the long-range
280      * forces, virial and energy from the PME nodes here.
281      */
282     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283     dvdl_q  = 0;
284     dvdl_lj = 0;
285     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286                       &cycles_seppme);
287     enerd->term[F_COUL_RECIP] += e_q;
288     enerd->term[F_LJ_RECIP]   += e_lj;
289     enerd->dvdl_lin[efptCOUL] += dvdl_q;
290     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
291
292     if (wcycle)
293     {
294         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
295     }
296     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
297 }
298
299 static void print_large_forces(FILE            *fp,
300                                const t_mdatoms *md,
301                                const t_commrec *cr,
302                                int64_t          step,
303                                real             forceTolerance,
304                                const rvec      *x,
305                                const rvec      *f)
306 {
307     real           force2Tolerance = gmx::square(forceTolerance);
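    /* Compare squared norms so that the square root is only taken for atoms
     * that are actually printed. */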
308     gmx::index     numNonFinite    = 0;
309     for (int i = 0; i < md->homenr; i++)
310     {
311         real force2    = norm2(f[i]);
312         bool nonFinite = !std::isfinite(force2);
313         if (force2 >= force2Tolerance || nonFinite)
314         {
315             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
316                     step,
317                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
318         }
319         if (nonFinite)
320         {
321             numNonFinite++;
322         }
323     }
324     if (numNonFinite > 0)
325     {
326         /* Note that with MPI this fatal call on one rank might interrupt
327          * the printing on other ranks. But we can only avoid that with
328          * an expensive MPI barrier that we would need at each step.
329          */
330         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
331     }
332 }
333
334 static void post_process_forces(const t_commrec           *cr,
335                                 int64_t                    step,
336                                 t_nrnb                    *nrnb,
337                                 gmx_wallcycle_t            wcycle,
338                                 const gmx_localtop_t      *top,
339                                 const matrix               box,
340                                 const rvec                 x[],
341                                 rvec                       f[],
342                                 gmx::ForceWithVirial      *forceWithVirial,
343                                 tensor                     vir_force,
344                                 const t_mdatoms           *mdatoms,
345                                 const t_graph             *graph,
346                                 const t_forcerec          *fr,
347                                 const gmx_vsite_t         *vsite,
348                                 int                        flags)
349 {
350     if (fr->haveDirectVirialContributions)
351     {
352         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
353
354         if (vsite)
355         {
356             /* Spread the mesh force on virtual sites to the other particles...
357              * This is parallellized. MPI communication is performed
358              * if the constructing atoms aren't local.
359              */
360             matrix virial = { { 0 } };
361             spread_vsite_f(vsite, x, fDirectVir, nullptr,
362                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
363                            nrnb,
364                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365             forceWithVirial->addVirialContribution(virial);
366         }
367
368         if (flags & GMX_FORCE_VIRIAL)
369         {
370             /* Now add the forces, this is local */
371             sum_forces(f, forceWithVirial->force_);
372
373             /* Add the direct virial contributions */
374             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
376
377             if (debug)
378             {
379                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
380             }
381         }
382     }
383
384     if (fr->print_force >= 0)
385     {
386         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
387     }
388 }
389
390 static void do_nb_verlet(t_forcerec                       *fr,
391                          const interaction_const_t        *ic,
392                          gmx_enerdata_t                   *enerd,
393                          const int                         flags,
394                          const Nbnxm::InteractionLocality  ilocality,
395                          const int                         clearF,
396                          const int64_t                     step,
397                          t_nrnb                           *nrnb,
398                          gmx_wallcycle_t                   wcycle)
399 {
400     if (!(flags & GMX_FORCE_NONBONDED))
401     {
402         /* skip non-bonded calculation */
403         return;
404     }
405
406     nonbonded_verlet_t *nbv  = fr->nbv;
407
408     /* GPU kernel launch overhead is already timed separately */
409     if (fr->cutoff_scheme != ecutsVERLET)
410     {
411         gmx_incons("Invalid cut-off scheme passed!");
412     }
413
414     if (!nbv->useGpu())
415     {
416         /* When dynamic pair-list pruning is requested, we need to prune
417          * the list every nstlistPrune steps.
418          */
419         if (nbv->listParams->useDynamicPruning &&
420             nbnxnIsDynamicPairlistPruningStep(*nbv, ilocality, step))
421         {
422             /* Prune the pair-list beyond fr->ic->rlistPrune using
423              * the current coordinates of the atoms.
424              */
425             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
426             nbv->dispatchPruneKernel(ilocality, fr->shift_vec);
427             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
428         }
429
430         wallcycle_sub_start(wcycle, ewcsNONBONDED);
431     }
432
433     NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
434
435     if (!nbv->useGpu())
436     {
437         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
438     }
439 }
440
441 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
442 {
443     return nbv != nullptr && nbv->useGpu();
444 }
445
446 static inline void clear_rvecs_omp(int n, rvec v[])
447 {
448     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
449
450     /* Note that we would like to avoid this conditional by putting it
451      * into the omp pragma instead, but then we still take the full
452      * omp parallel for overhead (at least with gcc5).
453      */
454     if (nth == 1)
455     {
456         for (int i = 0; i < n; i++)
457         {
458             clear_rvec(v[i]);
459         }
460     }
461     else
462     {
463 #pragma omp parallel for num_threads(nth) schedule(static)
464         for (int i = 0; i < n; i++)
465         {
466             clear_rvec(v[i]);
467         }
468     }
469 }
470
471 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
472  *
473  * \param groupOptions  Group options, containing T-coupling options
474  */
475 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
476 {
477     real nrdfCoupled   = 0;
478     real nrdfUncoupled = 0;
479     real kineticEnergy = 0;
480     for (int g = 0; g < groupOptions.ngtc; g++)
481     {
482         if (groupOptions.tau_t[g] >= 0)
483         {
484             nrdfCoupled   += groupOptions.nrdf[g];
485             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
486         }
487         else
488         {
489             nrdfUncoupled += groupOptions.nrdf[g];
490         }
491     }
492
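    /* kineticEnergy now holds the equipartition estimate 0.5*k_B*T_ref per
     * degree of freedom summed over all T-coupled degrees of freedom; below it
     * is scaled up to also account for the uncoupled degrees of freedom. */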
493     /* This conditional with > also catches nrdf=0 */
494     if (nrdfCoupled > nrdfUncoupled)
495     {
496         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
497     }
498     else
499     {
500         return 0;
501     }
502 }
503
504 /*! \brief This routine checks that the potential energy is finite.
505  *
506  * Always checks that the potential energy is finite. If step equals
507  * inputrec.init_step also checks that the magnitude of the potential energy
508  * is reasonable. Terminates with a fatal error when a check fails.
509  * Note that passing this check does not guarantee finite forces,
510  * since those use slightly different arithmetic. But in most cases
511  * there is just a narrow coordinate range where forces are not finite
512  * and energies are finite.
513  *
514  * \param[in] step      The step number, used for checking and printing
515  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
516  * \param[in] inputrec  The input record
517  */
518 static void checkPotentialEnergyValidity(int64_t               step,
519                                          const gmx_enerdata_t &enerd,
520                                          const t_inputrec     &inputrec)
521 {
522     /* Threshold valid for comparing absolute potential energy against
523      * the kinetic energy. Normally one should not consider absolute
524      * potential energy values, but with a factor of one million
525      * we should never get false positives.
526      */
527     constexpr real c_thresholdFactor = 1e6;
528
529     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
530     real           averageKineticEnergy = 0;
531     /* We only check for large potential energy at the initial step,
532      * because that is by far the most likely step for this to occur
533      * and because computing the average kinetic energy is not free.
534      * Note: nstcalcenergy >> 1 often does not allow catching large energies
535      * before they become NaN.
536      */
537     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
538     {
539         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
540     }
541
542     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
543                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
544     {
545         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
546                   step,
547                   enerd.term[F_EPOT],
548                   energyIsNotFinite ? "not finite" : "extremely high",
549                   enerd.term[F_LJ],
550                   enerd.term[F_COUL_SR],
551                   energyIsNotFinite ? "non-finite" : "very high",
552                   energyIsNotFinite ? " or NaN" : "");
553     }
554 }
555
556 /*! \brief Compute forces and/or energies for special algorithms
557  *
558  * The intention is to collect all calls to algorithms that compute
559  * forces on local atoms only and that do not contribute to the local
560  * virial sum (but add their virial contribution separately).
561  * Eventually these should likely all become ForceProviders.
562  * Within this function the intention is to have algorithms that do
563  * global communication at the end, so global barriers within the MD loop
564  * are as close together as possible.
565  *
566  * \param[in]     fplog            The log file
567  * \param[in]     cr               The communication record
568  * \param[in]     inputrec         The input record
569  * \param[in]     awh              The Awh module (nullptr if none in use).
570  * \param[in]     enforcedRotation Enforced rotation module.
571  * \param[in]     step             The current MD step
572  * \param[in]     t                The current time
573  * \param[in,out] wcycle           Wallcycle accounting struct
574  * \param[in,out] forceProviders   Pointer to a list of force providers
575  * \param[in]     box              The unit cell
576  * \param[in]     x                The coordinates
577  * \param[in]     mdatoms          Per atom properties
578  * \param[in]     lambda           Array of free-energy lambda values
579  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
580  * \param[in,out] forceWithVirial  Force and virial buffers
581  * \param[in,out] enerd            Energy buffer
582  * \param[in,out] ed               Essential dynamics pointer
583  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
584  *
585  * \todo Remove bNS, which is used incorrectly.
586  * \todo Convert all other algorithms called here to ForceProviders.
587  */
588 static void
589 computeSpecialForces(FILE                          *fplog,
590                      const t_commrec               *cr,
591                      const t_inputrec              *inputrec,
592                      gmx::Awh                      *awh,
593                      gmx_enfrot                    *enforcedRotation,
594                      int64_t                        step,
595                      double                         t,
596                      gmx_wallcycle_t                wcycle,
597                      ForceProviders                *forceProviders,
598                      matrix                         box,
599                      gmx::ArrayRef<const gmx::RVec> x,
600                      const t_mdatoms               *mdatoms,
601                      real                          *lambda,
602                      int                            forceFlags,
603                      gmx::ForceWithVirial          *forceWithVirial,
604                      gmx_enerdata_t                *enerd,
605                      gmx_edsam                     *ed,
606                      gmx_bool                       bNS)
607 {
608     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
609
610     /* NOTE: Currently all ForceProviders only provide forces.
611      *       When they also provide energies, remove this conditional.
612      */
613     if (computeForces)
614     {
615         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
616         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
617
618         /* Collect forces from modules */
619         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
620     }
621
622     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
623     {
624         pull_potential_wrapper(cr, inputrec, box, x,
625                                forceWithVirial,
626                                mdatoms, enerd, lambda, t,
627                                wcycle);
628
629         if (awh)
630         {
631             enerd->term[F_COM_PULL] +=
632                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
633                                                   forceWithVirial,
634                                                   t, step, wcycle, fplog);
635         }
636     }
637
638     rvec *f = as_rvec_array(forceWithVirial->force_.data());
639
640     /* Add the forces from enforced rotation potentials (if any) */
641     if (inputrec->bRot)
642     {
643         wallcycle_start(wcycle, ewcROTadd);
644         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
645         wallcycle_stop(wcycle, ewcROTadd);
646     }
647
648     if (ed)
649     {
650         /* Note that since init_edsam() is called after the initialization
651          * of forcerec, edsam doesn't request the noVirSum force buffer.
652          * Thus if no other algorithm (e.g. PME) requires it, the forces
653          * here will contribute to the virial.
654          */
655         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
656     }
657
658     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
659     if (inputrec->bIMD && computeForces)
660     {
661         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
662     }
663 }
664
665 /*! \brief Launch the prepare_step and spread stages of PME GPU.
666  *
667  * \param[in]  pmedata       The PME structure
668  * \param[in]  box           The box matrix
669  * \param[in]  x             Coordinate array
670  * \param[in]  flags         Force flags
671  * \param[in]  pmeFlags      PME flags
672  * \param[in]  wcycle        The wallcycle structure
673  */
674 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
675                                       matrix          box,
676                                       rvec            x[],
677                                       int             flags,
678                                       int             pmeFlags,
679                                       gmx_wallcycle_t wcycle)
680 {
681     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
682     pme_gpu_launch_spread(pmedata, x, wcycle);
683 }
684
685 /*! \brief Launch the FFT and gather stages of PME GPU
686  *
687  * This function only implements setting the output forces (no accumulation).
688  *
689  * \param[in]  pmedata        The PME structure
690  * \param[in]  wcycle         The wallcycle structure
691  */
692 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
693                                      gmx_wallcycle_t   wcycle)
694 {
695     pme_gpu_launch_complex_transforms(pmedata, wcycle);
696     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
697 }
698
699 /*! \brief
700  *  Polling wait for either of the PME or nonbonded GPU tasks.
701  *
702  * Instead of a static order in waiting for GPU tasks, this function
703  * polls to check which of the two tasks completes first, and overlaps the
704  * associated force-buffer reduction with the other task.
705  * By doing that, unlike static scheduling order, it can always overlap
706  * one of the reductions, regardless of the GPU task completion order.
707  *
708  * \param[in]     nbv              Nonbonded verlet structure
709  * \param[in,out] pmedata          PME module data
710  * \param[in,out] force            Force array to reduce task outputs into.
711  * \param[in,out] forceWithVirial  Force and virial buffers
712  * \param[in,out] fshift           Shift force output vector results are reduced into
713  * \param[in,out] enerd            Energy data structure results are reduced into
714  * \param[in]     flags            Force flags
715  * \param[in]     pmeFlags         PME flags
716  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
717  * \param[in]     wcycle           The wallcycle structure
718  */
719 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
720                                         gmx_pme_t                           *pmedata,
721                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
722                                         gmx::ForceWithVirial                *forceWithVirial,
723                                         rvec                                 fshift[],
724                                         gmx_enerdata_t                      *enerd,
725                                         int                                  flags,
726                                         int                                  pmeFlags,
727                                         bool                                 haveOtherWork,
728                                         gmx_wallcycle_t                      wcycle)
729 {
730     bool isPmeGpuDone = false;
731     bool isNbGpuDone  = false;
732
733
734     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
735
736     while (!isPmeGpuDone || !isNbGpuDone)
737     {
738         if (!isPmeGpuDone)
739         {
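            /* If the nonbonded task has already completed, block until PME is
             * done; otherwise only poll, so we can service whichever task
             * finishes first. */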
740             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
741             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
742         }
743
744         if (!isNbGpuDone)
745         {
746             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
747             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
748             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
749                                                      flags,
750                                                      Nbnxm::AtomLocality::Local,
751                                                      haveOtherWork,
752                                                      enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
753                                                      fshift, completionType);
754             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
755             // To get the call count right, we issue a start/stop pair
756             // once the task has finished.
757             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
758             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
759             if (isNbGpuDone)
760             {
761                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
762                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
763
764                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
765                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
766             }
767         }
768     }
769 }
770
771 /*! \brief
772  *  Launch the dynamic rolling pruning GPU task.
773  *
774  *  We currently alternate local/non-local list pruning in odd-even steps
775  *  (only pruning every second step without DD).
776  *
777  * \param[in]     cr               The communication record
778  * \param[in]     nbv              Nonbonded verlet structure
779  * \param[in]     inputrec         The input record
780  * \param[in]     step             The current MD step
781  */
782 static inline void launchGpuRollingPruning(const t_commrec          *cr,
783                                            const nonbonded_verlet_t *nbv,
784                                            const t_inputrec         *inputrec,
785                                            const int64_t             step)
786 {
787     /* We should not launch the rolling pruning kernel at a search
788      * step or just before search steps, since that's useless.
789      * Without domain decomposition we prune at even steps.
790      * With domain decomposition we alternate local and non-local
791      * pruning at even and odd steps.
792      */
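    /* Illustration of the schedule implemented below: without DD, pruning
     * kernels are launched only when stepWithCurrentList is even; with DD,
     * even steps prune the local list and odd steps the non-local one. */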
793     int  numRollingParts     = nbv->listParams->numRollingParts;
794     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
795     int  stepWithCurrentList = nbnxnNumStepsWithPairlist(*nbv, Nbnxm::InteractionLocality::Local, step);
796     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
797     if (stepWithCurrentList > 0 &&
798         stepWithCurrentList < inputrec->nstlist - 1 &&
799         (stepIsEven || havePPDomainDecomposition(cr)))
800     {
801         Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
802                                            stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
803                                            numRollingParts);
804     }
805 }
806
807 static void do_force_cutsVERLET(FILE *fplog,
808                                 const t_commrec *cr,
809                                 const gmx_multisim_t *ms,
810                                 const t_inputrec *inputrec,
811                                 gmx::Awh *awh,
812                                 gmx_enfrot *enforcedRotation,
813                                 int64_t step,
814                                 t_nrnb *nrnb,
815                                 gmx_wallcycle_t wcycle,
816                                 const gmx_localtop_t *top,
817                                 const gmx_groups_t * /* groups */,
818                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
819                                 history_t *hist,
820                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
821                                 tensor vir_force,
822                                 const t_mdatoms *mdatoms,
823                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
824                                 real *lambda,
825                                 t_graph *graph,
826                                 t_forcerec *fr,
827                                 gmx::PpForceWorkload *ppForceWorkload,
828                                 interaction_const_t *ic,
829                                 const gmx_vsite_t *vsite,
830                                 rvec mu_tot,
831                                 double t,
832                                 gmx_edsam *ed,
833                                 const int flags,
834                                 const DDBalanceRegionHandler &ddBalanceRegionHandler)
835 {
836     int                 cg1, i, j;
837     double              mu[2*DIM];
838     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
839     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
840     rvec                vzero, box_diag;
841     float               cycles_pme, cycles_wait_gpu;
842     nonbonded_verlet_t *nbv = fr->nbv;
843
844     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
845     bNS           = ((flags & GMX_FORCE_NS) != 0);
846     bFillGrid     = (bNS && bStateChanged);
847     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
848     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
849     bUseGPU       = fr->nbv->useGpu();
850     bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();
851
852     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
853     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
854     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
855         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
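    // The PME flags below always request spread and solve; the energy/virial
    // and force outputs are added only when the step's force flags ask for them.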
856     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
857         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
858         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
859         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
860
861     /* At a search step we need to start the first balancing region
862      * somewhere early inside the step after communication during domain
863      * decomposition (and not during the previous step as usual).
864      */
865     if (bNS)
866     {
867         ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
868     }
869
870     cycles_wait_gpu = 0;
871
872     const int start  = 0;
873     const int homenr = mdatoms->homenr;
874
875     clear_mat(vir_force);
876
877     if (DOMAINDECOMP(cr))
878     {
879         cg1 = cr->dd->globalAtomGroupIndices.size();
880     }
881     else
882     {
883         cg1 = top->cgs.nr;
884     }
885     if (fr->n_tpi > 0)
886     {
887         cg1--;
888     }
889
890     if (bStateChanged)
891     {
892         update_forcerec(fr, box);
893
894         if (inputrecNeedMutot(inputrec))
895         {
896             /* Calculate total (local) dipole moment in a temporary common array.
897              * This makes it possible to sum them over nodes faster.
898              */
899             calc_mu(start, homenr,
900                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
901                     mu, mu+DIM);
902         }
903     }
904
905     if (fr->ePBC != epbcNONE)
906     {
907         /* Compute shift vectors every step,
908          * because of pressure coupling or box deformation!
909          */
910         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
911         {
912             calc_shifts(box, fr->shift_vec);
913         }
914
915         if (bCalcCGCM)
916         {
917             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
918             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
919         }
920         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
921         {
922             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
923         }
924     }
925
926     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
927                                  fr->shift_vec, nbv->nbat);
928
929 #if GMX_MPI
930     if (!thisRankHasDuty(cr, DUTY_PME))
931     {
932         /* Send particle coordinates to the pme nodes.
933          * Since this is only implemented for domain decomposition
934          * and domain decomposition does not use the graph,
935          * we do not need to worry about shifting.
936          */
937         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
938                                  lambda[efptCOUL], lambda[efptVDW],
939                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
940                                  step, wcycle);
941     }
942 #endif /* GMX_MPI */
943
944     if (useGpuPme)
945     {
946         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
947     }
948
949     /* do gridding for pair search */
950     if (bNS)
951     {
952         if (graph && bStateChanged)
953         {
954             /* Calculate intramolecular shift vectors to make molecules whole */
955             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
956         }
957
958         clear_rvec(vzero);
959         box_diag[XX] = box[XX][XX];
960         box_diag[YY] = box[YY][YY];
961         box_diag[ZZ] = box[ZZ][ZZ];
962
963         wallcycle_start(wcycle, ewcNS);
964         if (!DOMAINDECOMP(cr))
965         {
966             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
967             nbnxn_put_on_grid(nbv, box,
968                               0, vzero, box_diag,
969                               nullptr, 0, mdatoms->homenr, -1,
970                               fr->cginfo, x.unpaddedArrayRef(),
971                               0, nullptr);
972             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
973         }
974         else
975         {
976             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
977             nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd),
978                                        fr->cginfo, x.unpaddedArrayRef());
979             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
980         }
981
982         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
983
984         wallcycle_stop(wcycle, ewcNS);
985     }
986
987     /* initialize the GPU atom data and copy shift vector */
988     if (bUseGPU)
989     {
990         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
991         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
992
993         if (bNS)
994         {
995             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
996         }
997
998         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
999
1000         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1001
1002         if (bNS && fr->gpuBonded)
1003         {
1004             /* Now that all atoms are on the grid, we can assign bonded
1005              * interactions to the GPU, where the grid order is
1006              * needed. Also the xq, f and fshift device buffers have
1007              * been reallocated if needed, so the bonded code can
1008              * learn about them. */
1009             // TODO the xq, f, and fshift buffers are now shared
1010             // resources, so they should be maintained by a
1011             // higher-level object than the nb module.
1012             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1013                                                                   top->idef,
1014                                                                   Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1015                                                                   Nbnxm::gpu_get_f(nbv->gpu_nbv),
1016                                                                   Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1017             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1018         }
1019
1020         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1021     }
1022
1023     /* do local pair search */
1024     if (bNS)
1025     {
1026         wallcycle_start_nocount(wcycle, ewcNS);
1027         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1028         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1029         nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
1030                                &top->excls, step, nrnb);
1031         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1032         wallcycle_stop(wcycle, ewcNS);
1033     }
1034     else
1035     {
1036         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1037                                         FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1038                                         nbv->nbat, wcycle);
1039     }
1040
1041     if (bUseGPU)
1042     {
1043         ddBalanceRegionHandler.openBeforeForceComputationGpu();
1044
1045         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1046
1047         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1048         Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1049         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1050
1051         // Bonded work is not split into separate local and non-local parts, so with DD
1052         // we can only launch the kernel after the non-local coordinates have been received.
1053         if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
1054         {
1055             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1056             fr->gpuBonded->launchKernels(fr, flags, box);
1057             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1058         }
1059
1060         /* launch local nonbonded work on GPU */
1061         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1062         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1063                      step, nrnb, wcycle);
1064         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1065         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1066     }
1067
1068     if (useGpuPme)
1069     {
1070         // In PME GPU and mixed mode we launch FFT / gather after the
1071         // X copy/transform to allow overlap as well as after the GPU NB
1072         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1073         // the nonbonded kernel.
1074         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1075     }
1076
1077     /* Communicate coordinates and sum dipole if necessary +
1078        do non-local pair search */
1079     if (havePPDomainDecomposition(cr))
1080     {
1081         if (bNS)
1082         {
1083             wallcycle_start_nocount(wcycle, ewcNS);
1084             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1085             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1086             nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
1087                                    &top->excls, step, nrnb);
1088             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1089             wallcycle_stop(wcycle, ewcNS);
1090         }
1091         else
1092         {
1093             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1094
1095             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1096                                             FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1097                                             nbv->nbat, wcycle);
1098         }
1099
1100         if (bUseGPU)
1101         {
1102             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1103
1104             /* launch non-local nonbonded tasks on GPU */
1105             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1106             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1107             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1108
1109             if (ppForceWorkload->haveGpuBondedWork)
1110             {
1111                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1112                 fr->gpuBonded->launchKernels(fr, flags, box);
1113                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1114             }
1115
1116             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1117             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1118                          step, nrnb, wcycle);
1119             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1120
1121             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1122         }
1123     }
1124
1125     if (bUseGPU)
1126     {
1127         /* launch D2H copy-back F */
1128         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1129         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1130         if (havePPDomainDecomposition(cr))
1131         {
1132             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1133                                       flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1134         }
1135         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1136                                   flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1137         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1138
1139         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1140         {
1141             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1142             fr->gpuBonded->launchEnergyTransfer();
1143             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1144         }
1145         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1146     }
1147
1148     if (bStateChanged && inputrecNeedMutot(inputrec))
1149     {
1150         if (PAR(cr))
1151         {
1152             gmx_sumd(2*DIM, mu, cr);
1153
1154             ddBalanceRegionHandler.reopenRegionCpu();
1155         }
1156
1157         for (i = 0; i < 2; i++)
1158         {
1159             for (j = 0; j < DIM; j++)
1160             {
1161                 fr->mu_tot[i][j] = mu[i*DIM + j];
1162             }
1163         }
1164     }
1165     if (fr->efep == efepNO)
1166     {
1167         copy_rvec(fr->mu_tot[0], mu_tot);
1168     }
1169     else
1170     {
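        /* With free-energy perturbation the total dipole is interpolated
         * linearly in the coupling parameter lambda between states A and B. */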
1171         for (j = 0; j < DIM; j++)
1172         {
1173             mu_tot[j] =
1174                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1175                 lambda[efptCOUL]*fr->mu_tot[1][j];
1176         }
1177     }
1178
1179     /* Reset energies */
1180     reset_enerdata(enerd);
1181     clear_rvecs(SHIFTS, fr->fshift);
1182
1183     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1184     {
1185         wallcycle_start(wcycle, ewcPPDURINGPME);
1186         dd_force_flop_start(cr->dd, nrnb);
1187     }
1188
1189     if (inputrec->bRot)
1190     {
1191         wallcycle_start(wcycle, ewcROT);
1192         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1193         wallcycle_stop(wcycle, ewcROT);
1194     }
1195
1196     /* Temporary solution until all routines take PaddedRVecVector */
1197     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1198
1199     /* Start the force cycle counter.
1200      * Note that a different counter is used for dynamic load balancing.
1201      */
1202     wallcycle_start(wcycle, ewcFORCE);
1203
1204     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1205     if (bDoForces)
1206     {
1207         /* If we need to compute the virial, we might need a separate
1208          * force buffer for algorithms for which the virial is calculated
1209          * directly, such as PME.
1210          */
1211         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1212         {
1213             forceRef = *fr->forceBufferForDirectVirialContributions;
1214
1215             /* TODO: update comment
1216              * We only compute forces on local atoms. Note that vsites can
1217              * spread to non-local atoms, but that part of the buffer is
1218              * cleared separately in the vsite spreading code.
1219              */
1220             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1221         }
1222         /* Clear the short- and long-range forces */
1223         clear_rvecs_omp(fr->natoms_force_constr, f);
1224     }
1225
1226     /* forceWithVirial uses the local atom range only */
1227     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1228
1229     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1230     {
1231         clear_pull_forces(inputrec->pull_work);
1232     }
1233
1234     /* When the non-bonded forces are computed on the CPU, we calculate them here.
1235      * We do this before calling do_force_lowlevel, because in that
1236      * function, the listed forces are calculated before PME, which
1237      * does communication.  With this order, non-bonded and listed
1238      * force calculation imbalance can be balanced out by the domain
1239      * decomposition load balancing.
1240      */
1241
1242     if (!bUseOrEmulGPU)
1243     {
1244         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1245                      step, nrnb, wcycle);
1246     }
1247
1248     if (fr->efep != efepNO)
1249     {
1250         /* Calculate the local and non-local free energy interactions here.
1251          * Happens here on the CPU both with and without GPU.
1252          */
1253         wallcycle_sub_start(wcycle, ewcsNONBONDED);
1254         nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
1255                                       fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
1256                                       inputrec->fepvals, lambda,
1257                                       enerd, flags, nrnb);
1258
1259         if (havePPDomainDecomposition(cr))
1260         {
1261             nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
1262                                           fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
1263                                           inputrec->fepvals, lambda,
1264                                           enerd, flags, nrnb);
1265         }
1266         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
1267     }
1268
1269     if (!bUseOrEmulGPU)
1270     {
1271         if (havePPDomainDecomposition(cr))
1272         {
1273             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1274                          step, nrnb, wcycle);
1275         }
1276
1277         const Nbnxm::InteractionLocality iloc =
1278             (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
1279
1280         /* Add all the non-bonded forces to the normal force array.
1281          * This can be split into a local and a non-local part when overlapping
1282          * communication with calculation with domain decomposition.
1283          */
1284         wallcycle_stop(wcycle, ewcFORCE);
1285
1286         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
1287
1288         wallcycle_start_nocount(wcycle, ewcFORCE);
1289
1290         /* if there are multiple fshift output buffers reduce them */
1291         if ((flags & GMX_FORCE_VIRIAL) &&
1292             nbv->pairlistSet(iloc).nnbl > 1)
1293         {
1294             /* This is not in a subcounter because it takes a
1295                negligible and constant-sized amount of time */
1296             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1297                                                      fr->fshift);
1298         }
1299     }
1300
1301     /* update QMMMrec, if necessary */
1302     if (fr->bQMMM)
1303     {
1304         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1305     }
1306
1307     /* Compute the bonded and non-bonded energies and optionally forces */
1308     do_force_lowlevel(fr, inputrec, &(top->idef),
1309                       cr, ms, nrnb, wcycle, mdatoms,
1310                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1311                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1312                       flags,
1313                       &cycles_pme, ddBalanceRegionHandler);
1314
1315     wallcycle_stop(wcycle, ewcFORCE);
1316
1317     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1318                          step, t, wcycle,
1319                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1320                          flags, &forceWithVirial, enerd,
1321                          ed, bNS);
1322
1323     if (bUseOrEmulGPU)
1324     {
1325         /* wait for non-local forces (or calculate in emulation mode) */
1326         if (havePPDomainDecomposition(cr))
1327         {
1328             if (bUseGPU)
1329             {
1330                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1331                 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1332                                             flags, Nbnxm::AtomLocality::NonLocal,
1333                                             ppForceWorkload->haveGpuBondedWork,
1334                                             enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1335                                             fr->fshift);
1336                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1337             }
1338             else
1339             {
1340                 wallcycle_start_nocount(wcycle, ewcFORCE);
1341                 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1342                              step, nrnb, wcycle);
1343                 wallcycle_stop(wcycle, ewcFORCE);
1344             }
1345
1346             /* skip the reduction if there was no non-local work to do */
1347             if (!nbv->pairlistSet(Nbnxm::InteractionLocality::NonLocal).nblGpu[0]->sci.empty())
1348             {
1349                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1350                                                nbv->nbat, f, wcycle);
1351             }
1352         }
1353     }
1354
1355     if (havePPDomainDecomposition(cr))
1356     {
1357         /* We are done with the CPU compute.
1358          * We will now communicate the non-local forces.
1359          * If we use a GPU this will overlap with GPU work, so in that case
1360          * we do not close the DD force balancing region here.
1361          */
1362         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1363
1364         if (bDoForces)
1365         {
1366             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1367         }
1368     }
1369
1370     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1371     // an alternating wait/reduction scheme.
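    // Presumably this lets the force reduction for whichever GPU task finishes
    // first overlap with the wait for the other task, rather than always
    // waiting for the PME and nonbonded results in a fixed order.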
1372     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1373     if (alternateGpuWait)
1374     {
1375         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1376     }
1377
1378     if (!alternateGpuWait && useGpuPme)
1379     {
1380         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1381     }
1382
1383     /* Wait for local GPU NB outputs on the non-alternating wait path */
1384     if (!alternateGpuWait && bUseGPU)
1385     {
1386         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1387          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1388          * but even with a step of 0.1 ms the difference is less than 1%
1389          * of the step time.
1390          */
1391         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1392
1393         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1394         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1395                                     flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1396                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1397                                     fr->fshift);
1398         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1399
1400         if (ddBalanceRegionHandler.useBalancingRegion())
1401         {
1402             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1403             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1404             {
1405                 /* We measured only a few cycles, so it could be that the kernel
1406                  * and transfer finished earlier and there was no actual
1407                  * wait time, only API call overhead.
1408                  * Then the actual time could be anywhere between 0 and
1409                  * cycles_wait_est. We will use half of cycles_wait_est.
1410                  */
1411                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1412             }
1413             ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
1414         }
1415     }
1416
1417     if (fr->nbv->emulateGpu())
1418     {
1419         // NOTE: emulation kernel is not included in the balancing region,
1420         // but emulation mode does not target performance anyway
1421         wallcycle_start_nocount(wcycle, ewcFORCE);
1422         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1423                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1424                      step, nrnb, wcycle);
1425         wallcycle_stop(wcycle, ewcFORCE);
1426     }
1427
1428     if (useGpuPme)
1429     {
1430         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1431     }
1432
1433     if (bUseGPU)
1434     {
1435         /* now clear the GPU outputs while we finish the step on the CPU */
1436         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1437         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1438         Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1439
1440         /* Is dynamic pair-list pruning activated? */
1441         if (nbv->listParams->useDynamicPruning)
1442         {
1443             launchGpuRollingPruning(cr, nbv, inputrec, step);
1444         }
1445         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1446         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1447     }
1448
1449     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1450     {
1451         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1452         // in principle this should be included in the DD balancing region,
1453         // but generally it is infrequent so we'll omit it for the sake of
1454         // simpler code
1455         fr->gpuBonded->accumulateEnergyTerms(enerd);
1456         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1457
1458         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1459         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1460         fr->gpuBonded->clearEnergies();
1461         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1462         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1463     }
1464
1465     /* Do the nonbonded GPU (or emulation) force buffer reduction
1466      * on the non-alternating path. */
1467     if (bUseOrEmulGPU && !alternateGpuWait)
1468     {
1469         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1470                                        nbv->nbat, f, wcycle);
1471     }
1472     if (DOMAINDECOMP(cr))
1473     {
1474         dd_force_flop_stop(cr->dd, nrnb);
1475     }
1476
1477     if (bDoForces)
1478     {
1479         /* If we have NoVirSum forces, but we do not calculate the virial,
1480          * we sum fr->f_novirsum=f later.
1481          */
1482         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1483         {
1484             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1485                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1486         }
1487
1488         if (flags & GMX_FORCE_VIRIAL)
1489         {
1490             /* Calculation of the virial must be done after vsites! */
1491             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1492                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1493         }
1494     }
1495
1496     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1497     {
1498         /* In case of node-splitting, the PP nodes receive the long-range
1499          * forces, virial and energy from the PME nodes here.
1500          */
1501         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1502     }
1503
1504     if (bDoForces)
1505     {
1506         post_process_forces(cr, step, nrnb, wcycle,
1507                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1508                             vir_force, mdatoms, graph, fr, vsite,
1509                             flags);
1510     }
1511
1512     if (flags & GMX_FORCE_ENERGY)
1513     {
1514         /* Sum the potential energy terms from group contributions */
1515         sum_epot(&(enerd->grpp), enerd->term);
1516
1517         if (!EI_TPI(inputrec->eI))
1518         {
1519             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1520         }
1521     }
1522 }
1523
1524 static void do_force_cutsGROUP(FILE *fplog,
1525                                const t_commrec *cr,
1526                                const gmx_multisim_t *ms,
1527                                const t_inputrec *inputrec,
1528                                gmx::Awh *awh,
1529                                gmx_enfrot *enforcedRotation,
1530                                int64_t step,
1531                                t_nrnb *nrnb,
1532                                gmx_wallcycle_t wcycle,
1533                                gmx_localtop_t *top,
1534                                const gmx_groups_t *groups,
1535                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1536                                history_t *hist,
1537                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1538                                tensor vir_force,
1539                                const t_mdatoms *mdatoms,
1540                                gmx_enerdata_t *enerd,
1541                                t_fcdata *fcd,
1542                                real *lambda,
1543                                t_graph *graph,
1544                                t_forcerec *fr,
1545                                const gmx_vsite_t *vsite,
1546                                rvec mu_tot,
1547                                double t,
1548                                gmx_edsam *ed,
1549                                int flags,
1550                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
1551 {
1552     int        cg0, cg1, i, j;
1553     double     mu[2*DIM];
1554     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1555     gmx_bool   bDoForces;
1556     float      cycles_pme;
1557
1558     const int  start  = 0;
1559     const int  homenr = mdatoms->homenr;
1560
1561     clear_mat(vir_force);
1562
1563     cg0 = 0;
1564     if (DOMAINDECOMP(cr))
1565     {
1566         cg1 = cr->dd->globalAtomGroupIndices.size();
1567     }
1568     else
1569     {
1570         cg1 = top->cgs.nr;
1571     }
1572     if (fr->n_tpi > 0)
1573     {
1574         cg1--;
1575     }
1576
1577     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1578     bNS            = ((flags & GMX_FORCE_NS) != 0);
1579     /* Should we put the charge groups on the neighborsearching grid this step? */
1580     bFillGrid      = (bNS && bStateChanged);
1581     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1582     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1583
1584     if (bStateChanged)
1585     {
1586         update_forcerec(fr, box);
1587
1588         if (inputrecNeedMutot(inputrec))
1589         {
1590             /* Calculate total (local) dipole moment in a temporary common array.
1591              * This makes it possible to sum them over nodes faster.
1592              */
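            /* Note: mu[0..DIM-1] receives the A-state dipole and
             * mu[DIM..2*DIM-1] the B-state dipole, so the single
             * gmx_sumd(2*DIM, ...) call below reduces both at once.
             */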
1593             calc_mu(start, homenr,
1594                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1595                     mu, mu+DIM);
1596         }
1597     }
1598
1599     if (fr->ePBC != epbcNONE)
1600     {
1601         /* Compute shift vectors every step,
1602          * because of pressure coupling or box deformation!
1603          */
1604         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1605         {
1606             calc_shifts(box, fr->shift_vec);
1607         }
1608
1609         if (bCalcCGCM)
1610         {
1611             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1612                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1613             inc_nrnb(nrnb, eNR_CGCM, homenr);
1614             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1615         }
1616         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1617         {
1618             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1619         }
1620     }
1621     else if (bCalcCGCM)
1622     {
1623         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1624         inc_nrnb(nrnb, eNR_CGCM, homenr);
1625     }
1626
1627     if (bCalcCGCM && gmx_debug_at)
1628     {
1629         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1630     }
1631
1632 #if GMX_MPI
1633     if (!thisRankHasDuty(cr, DUTY_PME))
1634     {
1635         /* Send particle coordinates to the pme nodes.
1636          * Since this is only implemented for domain decomposition
1637          * and domain decomposition does not use the graph,
1638          * we do not need to worry about shifting.
1639          */
1640         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1641                                  lambda[efptCOUL], lambda[efptVDW],
1642                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1643                                  step, wcycle);
1644     }
1645 #endif /* GMX_MPI */
1646
1647     /* Communicate coordinates and sum dipole if necessary */
1648     if (DOMAINDECOMP(cr))
1649     {
1650         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1651
1652         /* No GPU support, no move_x overlap, so reopen the balance region here */
1653         ddBalanceRegionHandler.reopenRegionCpu();
1654     }
1655
1656     if (inputrecNeedMutot(inputrec))
1657     {
1658         if (bStateChanged)
1659         {
1660             if (PAR(cr))
1661             {
1662                 gmx_sumd(2*DIM, mu, cr);
1663
1664                 ddBalanceRegionHandler.reopenRegionCpu();
1665             }
1666             for (i = 0; i < 2; i++)
1667             {
1668                 for (j = 0; j < DIM; j++)
1669                 {
1670                     fr->mu_tot[i][j] = mu[i*DIM + j];
1671                 }
1672             }
1673         }
1674         if (fr->efep == efepNO)
1675         {
1676             copy_rvec(fr->mu_tot[0], mu_tot);
1677         }
1678         else
1679         {
1680             for (j = 0; j < DIM; j++)
1681             {
1682                 mu_tot[j] =
1683                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1684             }
1685         }
1686     }
1687
1688     /* Reset energies */
1689     reset_enerdata(enerd);
1690     clear_rvecs(SHIFTS, fr->fshift);
1691
1692     if (bNS)
1693     {
1694         wallcycle_start(wcycle, ewcNS);
1695
1696         if (graph && bStateChanged)
1697         {
1698             /* Calculate intramolecular shift vectors to make molecules whole */
1699             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1700         }
1701
1702         /* Do the actual neighbour searching */
1703         ns(fplog, fr, box,
1704            groups, top, mdatoms,
1705            cr, nrnb, bFillGrid);
1706
1707         wallcycle_stop(wcycle, ewcNS);
1708     }
1709
1710     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1711     {
1712         wallcycle_start(wcycle, ewcPPDURINGPME);
1713         dd_force_flop_start(cr->dd, nrnb);
1714     }
1715
1716     if (inputrec->bRot)
1717     {
1718         wallcycle_start(wcycle, ewcROT);
1719         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1720         wallcycle_stop(wcycle, ewcROT);
1721     }
1722
1723     /* Temporary solution until all routines take PaddedRVecVector */
1724     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1725
1726     /* Start the force cycle counter.
1727      * Note that a different counter is used for dynamic load balancing.
1728      */
1729     wallcycle_start(wcycle, ewcFORCE);
1730
1731     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1732     if (bDoForces)
1733     {
1734         /* If we need to compute the virial, we might need a separate
1735          * force buffer for algorithms for which the virial is calculated
1736          * separately, such as PME.
1737          */
1738         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1739         {
1740             forceRef = *fr->forceBufferForDirectVirialContributions;
1741
1742             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1743         }
1744
1745         /* Clear the short- and long-range forces */
1746         clear_rvecs(fr->natoms_force_constr, f);
1747     }
1748
1749     /* forceWithVirial might need the full force atom range */
1750     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1751
1752     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1753     {
1754         clear_pull_forces(inputrec->pull_work);
1755     }
1756
1757     /* update QMMMrec, if necessary */
1758     if (fr->bQMMM)
1759     {
1760         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1761     }
1762
1763     /* Compute the bonded and non-bonded energies and optionally forces */
1764     do_force_lowlevel(fr, inputrec, &(top->idef),
1765                       cr, ms, nrnb, wcycle, mdatoms,
1766                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1767                       box, inputrec->fepvals, lambda,
1768                       graph, &(top->excls), fr->mu_tot,
1769                       flags,
1770                       &cycles_pme, ddBalanceRegionHandler);
1771
1772     wallcycle_stop(wcycle, ewcFORCE);
1773
1774     if (DOMAINDECOMP(cr))
1775     {
1776         dd_force_flop_stop(cr->dd, nrnb);
1777
1778         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1779     }
1780
1781     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1782                          step, t, wcycle,
1783                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1784                          flags, &forceWithVirial, enerd,
1785                          ed, bNS);
1786
1787     if (bDoForces)
1788     {
1789         /* Communicate the forces */
1790         if (DOMAINDECOMP(cr))
1791         {
1792             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1793             /* Do we need to communicate the separate force array
1794              * for terms that do not contribute to the single sum virial?
1795              * Position restraints and electric fields do not introduce
1796              * inter-cg forces, only full electrostatics methods do.
1797              * When we do not calculate the virial, fr->f_novirsum = f,
1798              * so we have already communicated these forces.
1799              */
1800             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1801                 (flags & GMX_FORCE_VIRIAL))
1802             {
1803                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1804             }
1805         }
1806
1807         /* If we have NoVirSum forces, but we do not calculate the virial,
1808          * we sum fr->f_novirsum=f later.
1809          */
1810         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1811         {
1812             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1813                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1814         }
1815
1816         if (flags & GMX_FORCE_VIRIAL)
1817         {
1818             /* Calculation of the virial must be done after vsites! */
1819             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1820                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1821         }
1822     }
1823
1824     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1825     {
1826         /* In case of node-splitting, the PP nodes receive the long-range
1827          * forces, virial and energy from the PME nodes here.
1828          */
1829         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1830     }
1831
1832     if (bDoForces)
1833     {
1834         post_process_forces(cr, step, nrnb, wcycle,
1835                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1836                             vir_force, mdatoms, graph, fr, vsite,
1837                             flags);
1838     }
1839
1840     if (flags & GMX_FORCE_ENERGY)
1841     {
1842         /* Sum the potential energy terms from group contributions */
1843         sum_epot(&(enerd->grpp), enerd->term);
1844
1845         if (!EI_TPI(inputrec->eI))
1846         {
1847             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1848         }
1849     }
1850
1851 }
1852
1853 void do_force(FILE                                     *fplog,
1854               const t_commrec                          *cr,
1855               const gmx_multisim_t                     *ms,
1856               const t_inputrec                         *inputrec,
1857               gmx::Awh                                 *awh,
1858               gmx_enfrot                               *enforcedRotation,
1859               int64_t                                   step,
1860               t_nrnb                                   *nrnb,
1861               gmx_wallcycle_t                           wcycle,
1862               gmx_localtop_t                           *top,
1863               const gmx_groups_t                       *groups,
1864               matrix                                    box,
1865               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
1866               history_t                                *hist,
1867               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
1868               tensor                                    vir_force,
1869               const t_mdatoms                          *mdatoms,
1870               gmx_enerdata_t                           *enerd,
1871               t_fcdata                                 *fcd,
1872               gmx::ArrayRef<real>                       lambda,
1873               t_graph                                  *graph,
1874               t_forcerec                               *fr,
1875               gmx::PpForceWorkload                     *ppForceWorkload,
1876               const gmx_vsite_t                        *vsite,
1877               rvec                                      mu_tot,
1878               double                                    t,
1879               gmx_edsam                                *ed,
1880               int                                       flags,
1881               const DDBalanceRegionHandler             &ddBalanceRegionHandler)
1882 {
1883     /* modify force flag if not doing nonbonded */
1884     if (!fr->bNonbonded)
1885     {
1886         flags &= ~GMX_FORCE_NONBONDED;
1887     }
1888
1889     switch (inputrec->cutoff_scheme)
1890     {
1891         case ecutsVERLET:
1892             do_force_cutsVERLET(fplog, cr, ms, inputrec,
1893                                 awh, enforcedRotation, step, nrnb, wcycle,
1894                                 top,
1895                                 groups,
1896                                 box, x, hist,
1897                                 force, vir_force,
1898                                 mdatoms,
1899                                 enerd, fcd,
1900                                 lambda.data(), graph,
1901                                 fr,
1902                                 ppForceWorkload,
1903                                 fr->ic,
1904                                 vsite, mu_tot,
1905                                 t, ed,
1906                                 flags,
1907                                 ddBalanceRegionHandler);
1908             break;
1909         case ecutsGROUP:
1910             do_force_cutsGROUP(fplog, cr, ms, inputrec,
1911                                awh, enforcedRotation, step, nrnb, wcycle,
1912                                top,
1913                                groups,
1914                                box, x, hist,
1915                                force, vir_force,
1916                                mdatoms,
1917                                enerd, fcd,
1918                                lambda.data(), graph,
1919                                fr, vsite, mu_tot,
1920                                t, ed,
1921                                flags,
1922                                ddBalanceRegionHandler);
1923             break;
1924         default:
1925             gmx_incons("Invalid cut-off scheme passed!");
1926     }
1927
1928     /* In case we don't have constraints and are using GPUs, the next balancing
1929      * region starts here.
1930      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1931      * virial calculation and COM pulling, is thus not included in
1932      * the balance timing, which is OK as most of these tasks involve communication.
1933      */
1934     ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
1935 }
1936
1937
1938 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
1939                         const t_inputrec *ir, const t_mdatoms *md,
1940                         t_state *state)
1941 {
1942     int             i, m, start, end;
1943     int64_t         step;
1944     real            dt = ir->delta_t;
1945     real            dvdl_dum;
1946     rvec           *savex;
1947
1948     /* We need to allocate one element extra, since we might use
1949      * (unaligned) 4-wide SIMD loads to access rvec entries.
1950      */
1951     snew(savex, state->natoms + 1);
1952
1953     start = 0;
1954     end   = md->homenr;
1955
1956     if (debug)
1957     {
1958         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1959                 start, md->homenr, end);
1960     }
1961     /* Do a first constrain to reset particles... */
1962     step = ir->init_step;
1963     if (fplog)
1964     {
1965         char buf[STEPSTRSIZE];
1966         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1967                 gmx_step_str(step, buf));
1968     }
1969     dvdl_dum = 0;
1970
1971     /* constrain the current position */
1972     constr->apply(TRUE, FALSE,
1973                   step, 0, 1.0,
1974                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
1975                   state->box,
1976                   state->lambda[efptBONDED], &dvdl_dum,
1977                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
1978     if (EI_VV(ir->eI))
1979     {
1980         /* constrain the initial velocity, and save it */
1981         /* this may also be useful if we need the ekin from the half step for velocity Verlet */
1982         constr->apply(TRUE, FALSE,
1983                       step, 0, 1.0,
1984                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
1985                       state->box,
1986                       state->lambda[efptBONDED], &dvdl_dum,
1987                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
1988     }
1989     /* constrain the initial velocities at t-dt/2 */
1990     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1991     {
1992         auto x = makeArrayRef(state->x).subArray(start, end);
1993         auto v = makeArrayRef(state->v).subArray(start, end);
1994         for (i = start; (i < end); i++)
1995         {
1996             for (m = 0; (m < DIM); m++)
1997             {
1998                 /* Reverse the velocity */
1999                 v[i][m] = -v[i][m];
2000                 /* Store the position at t-dt in buf */
2001                 savex[i][m] = x[i][m] + dt*v[i][m];
2002             }
2003         }
2004         /* Shake the positions at t=-dt with the positions at t=0
2005          * as reference coordinates.
2006          */
2007         if (fplog)
2008         {
2009             char buf[STEPSTRSIZE];
2010             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2011                     gmx_step_str(step, buf));
2012         }
2013         dvdl_dum = 0;
2014         constr->apply(TRUE, FALSE,
2015                       step, -1, 1.0,
2016                       state->x.rvec_array(), savex, nullptr,
2017                       state->box,
2018                       state->lambda[efptBONDED], &dvdl_dum,
2019                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2020
2021         for (i = start; i < end; i++)
2022         {
2023             for (m = 0; m < DIM; m++)
2024             {
2025                 /* Re-reverse the velocities */
2026                 v[i][m] = -v[i][m];
2027             }
2028         }
2029     }
2030     sfree(savex);
2031 }
2032
2033
2034 static void
2035 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2036                 double *enerout, double *virout)
2037 {
2038     double enersum, virsum;
2039     double invscale, invscale2, invscale3;
2040     double r, ea, eb, ec, pa, pb, pc, pd;
2041     double y0, f, g, h;
2042     int    ri, offset;
2043     double tabfactor;
2044
2045     invscale  = 1.0/scale;
2046     invscale2 = invscale*invscale;
2047     invscale3 = invscale*invscale2;
2048
2049     /* Following summation derived from cubic spline definition,
2050      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2051      * the cubic spline.  We first calculate the negative of the
2052      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2053      * add the more standard, abrupt cutoff correction to that result,
2054      * yielding the long-range correction for a switched function.  We
2055      * perform both the pressure and energy loops at the same time for
2056      * simplicity, as the computational cost is low. */
2057
2058     if (offstart == 0)
2059     {
2060         /* Since the dispersion table has been scaled down a factor
2061          * 6.0 and the repulsion a factor 12.0 to compensate for the
2062          * c6/c12 parameters inside nbfp[] being scaled up (to save
2063          * flops in kernels), we need to correct for this.
2064          */
2065         tabfactor = 6.0;
2066     }
2067     else
2068     {
2069         tabfactor = 12.0;
2070     }
2071
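    /* Note on the sums below: within table interval ri the spline is
     * V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with eps in [0,1] and
     * r(eps) = (ri + eps)/scale, so r^2 dr = (ec + eb*eps + ea*eps^2) deps
     * and r^3 = pd + pc*eps + pb*eps^2 + pa*eps^3.  Thus enersum accumulates
     * the exact integral of V(r) r^2 dr and virsum that of r^3 dV/dr dr
     * (written as dV/deps times r^3 deps) over each interval.
     */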
2072     enersum = 0.0;
2073     virsum  = 0.0;
2074     for (ri = rstart; ri < rend; ++ri)
2075     {
2076         r  = ri*invscale;
2077         ea = invscale3;
2078         eb = 2.0*invscale2*r;
2079         ec = invscale*r*r;
2080
2081         pa = invscale3;
2082         pb = 3.0*invscale2*r;
2083         pc = 3.0*invscale*r*r;
2084         pd = r*r*r;
2085
2086         /* this "8" is from the packing in the vdwtab array - perhaps
2087            should be defined? */
2088
2089         offset = 8*ri + offstart;
2090         y0     = vdwtab[offset];
2091         f      = vdwtab[offset+1];
2092         g      = vdwtab[offset+2];
2093         h      = vdwtab[offset+3];
2094
2095         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2096         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2097     }
2098     *enerout = 4.0*M_PI*enersum*tabfactor;
2099     *virout  = 4.0*M_PI*virsum*tabfactor;
2100 }
2101
2102 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2103 {
2104     double   eners[2], virs[2], enersum, virsum;
2105     double   r0, rc3, rc9;
2106     int      ri0, ri1, i;
2107     real     scale, *vdwtab;
2108
2109     fr->enershiftsix    = 0;
2110     fr->enershifttwelve = 0;
2111     fr->enerdiffsix     = 0;
2112     fr->enerdifftwelve  = 0;
2113     fr->virdiffsix      = 0;
2114     fr->virdifftwelve   = 0;
2115
2116     const interaction_const_t *ic = fr->ic;
2117
2118     if (eDispCorr != edispcNO)
2119     {
2120         for (i = 0; i < 2; i++)
2121         {
2122             eners[i] = 0;
2123             virs[i]  = 0;
2124         }
2125         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2126             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2127             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2128             (ic->vdwtype == evdwSHIFT) ||
2129             (ic->vdwtype == evdwSWITCH))
2130         {
2131             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2132                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2133                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2134             {
2135                 gmx_fatal(FARGS,
2136                           "With dispersion correction rvdw-switch can not be zero "
2137                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2138             }
2139
2140             /* TODO This code depends on the logic in tables.c that
2141                constructs the table layout, which should be made
2142                explicit in future cleanup. */
2143             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2144                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2145             scale  = fr->dispersionCorrectionTable->scale;
2146             vdwtab = fr->dispersionCorrectionTable->data;
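            /* As used here, the table packs 8 reals per point: cubic-spline
             * coefficients (Y, F, G, H) for the dispersion term at offsets 0-3
             * and for the repulsion term at offsets 4-7 (stored scaled down by
             * 6 and 12 respectively, see integrate_table()).  Hence the
             * offstart arguments 0 and 4 in the calls below.
             */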
2147
2148             /* Round the cut-offs to exact table values for precision */
2149             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2150             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2151
2152             /* The code below has some support for handling force-switching, i.e.
2153              * when the force (instead of potential) is switched over a limited
2154              * region. This leads to a constant shift in the potential inside the
2155              * switching region, which we can handle by adding a constant energy
2156              * term in the force-switch case just like when we do potential-shift.
2157              *
2158              * For now this is not enabled, but to keep the functionality in the
2159              * code we check separately for switch and shift. When we do force-switch
2160              * the shifting point is rvdw_switch, while it is the cutoff when we
2161              * have a classical potential-shift.
2162              *
2163              * For a pure potential-shift the potential has a constant shift
2164              * all the way out to the cutoff, and that is it. For other forms
2165              * we need to calculate the constant shift up to the point where we
2166              * start modifying the potential.
2167              */
2168             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2169
2170             r0   = ri0/scale;
2171             rc3  = r0*r0*r0;
2172             rc9  = rc3*rc3*rc3;
2173
2174             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2175                 (ic->vdwtype == evdwSHIFT))
2176             {
2177                 /* Determine the constant energy shift below rvdw_switch.
2178                  * Table has a scale factor since we have scaled it down to compensate
2179                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2180                  */
2181                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2182                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2183             }
2184             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2185             {
2186                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2187                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2188             }
2189
2190             /* Add the constant part from 0 to rvdw_switch.
2191              * This integration from 0 to rvdw_switch overcounts the number
2192              * of interactions by 1, as it also counts the self interaction.
2193              * We will correct for this later.
2194              */
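            /* I.e. shift * integral_0^r0 4 pi r^2 dr = (4 pi/3) r0^3 * shift,
             * once for the dispersion shift and once for the repulsion shift.
             */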
2195             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2196             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2197
2198             /* Calculate the contribution in the range [r0,r1] where we
2199              * modify the potential. For a pure potential-shift modifier we will
2200              * have ri0==ri1, and there will not be any contribution here.
2201              */
2202             for (i = 0; i < 2; i++)
2203             {
2204                 enersum = 0;
2205                 virsum  = 0;
2206                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2207                 eners[i] -= enersum;
2208                 virs[i]  -= virsum;
2209             }
2210
2211             /* Alright: Above we compensated by REMOVING the parts outside r0
2212              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2213              *
2214              * Regardless of whether r0 is the point where we start switching,
2215              * or the cutoff where we calculated the constant shift, we include
2216              * all the parts we are missing out to infinity from r0 by
2217              * calculating the analytical dispersion correction.
2218              */
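            /* These are the analytic tail integrals of the unmodified potentials:
             *   dispersion: integral_r0^inf 4 pi r^2 (-1/r^6)  dr = -4 pi/(3 r0^3)
             *   repulsion:  integral_r0^inf 4 pi r^2 ( 1/r^12) dr =  4 pi/(9 r0^9)
             * and the corresponding integrals of 4 pi r^3 dV/dr give the
             * 8 pi/r0^3 and -16 pi/(3 r0^9) virial terms.
             */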
2219             eners[0] += -4.0*M_PI/(3.0*rc3);
2220             eners[1] +=  4.0*M_PI/(9.0*rc9);
2221             virs[0]  +=  8.0*M_PI/rc3;
2222             virs[1]  += -16.0*M_PI/(3.0*rc9);
2223         }
2224         else if (ic->vdwtype == evdwCUT ||
2225                  EVDW_PME(ic->vdwtype) ||
2226                  ic->vdwtype == evdwUSER)
2227         {
2228             if (ic->vdwtype == evdwUSER && fplog)
2229             {
2230                 fprintf(fplog,
2231                         "WARNING: using dispersion correction with user tables\n");
2232             }
2233
2234             /* Note that with LJ-PME, the dispersion correction is multiplied
2235              * by the difference between the actual C6 and the C6 value that
2236              * the combination rule would produce.
2237              * This means the normal energy and virial difference formulas
2238              * can be used here.
2239              */
2240
2241             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2242             rc9  = rc3*rc3*rc3;
2243             /* Contribution beyond the cut-off */
2244             eners[0] += -4.0*M_PI/(3.0*rc3);
2245             eners[1] +=  4.0*M_PI/(9.0*rc9);
2246             if (ic->vdw_modifier == eintmodPOTSHIFT)
2247             {
2248                 /* Contribution within the cut-off */
2249                 eners[0] += -4.0*M_PI/(3.0*rc3);
2250                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2251             }
2252             /* Contribution beyond the cut-off */
2253             virs[0]  +=  8.0*M_PI/rc3;
2254             virs[1]  += -16.0*M_PI/(3.0*rc9);
2255         }
2256         else
2257         {
2258             gmx_fatal(FARGS,
2259                       "Dispersion correction is not implemented for vdw-type = %s",
2260                       evdw_names[ic->vdwtype]);
2261         }
2262
2263         /* When we deprecate the group kernels the code below can go too */
2264         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2265         {
2266             /* Calculate self-interaction coefficient (assuming that
2267              * the reciprocal-space contribution is constant in the
2268              * region that contributes to the self-interaction).
2269              */
2270             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2271
2272             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2273             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2274         }
2275
2276         fr->enerdiffsix    = eners[0];
2277         fr->enerdifftwelve = eners[1];
2278         /* The 0.5 is due to the Gromacs definition of the virial */
2279         fr->virdiffsix     = 0.5*virs[0];
2280         fr->virdifftwelve  = 0.5*virs[1];
2281     }
2282 }
2283
2284 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2285                    const matrix box, real lambda, tensor pres, tensor virial,
2286                    real *prescorr, real *enercorr, real *dvdlcorr)
2287 {
2288     gmx_bool bCorrAll, bCorrPres;
2289     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2290     int      m;
2291
2292     *prescorr = 0;
2293     *enercorr = 0;
2294     *dvdlcorr = 0;
2295
2296     clear_mat(virial);
2297     clear_mat(pres);
2298
2299     if (ir->eDispCorr != edispcNO)
2300     {
2301         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2302                      ir->eDispCorr == edispcAllEnerPres);
2303         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2304                      ir->eDispCorr == edispcAllEnerPres);
2305
2306         invvol = 1/det(box);
2307         if (fr->n_tpi)
2308         {
2309             /* Only correct for the interactions with the inserted molecule */
2310             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2311             ninter = fr->n_tpi;
2312         }
2313         else
2314         {
2315             dens   = fr->numAtomsForDispersionCorrection*invvol;
2316             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2317         }
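        /* ninter*dens then approximates the pair count over volume,
         * N*(N-1)/(2*V) ~ N^2/(2*V), which below scales the per-pair
         * integrals computed in calc_enervirdiff().
         */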
2318
2319         if (ir->efep == efepNO)
2320         {
2321             avcsix    = fr->avcsix[0];
2322             avctwelve = fr->avctwelve[0];
2323         }
2324         else
2325         {
2326             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2327             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2328         }
2329
2330         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2331         *enercorr += avcsix*enerdiff;
2332         dvdlambda  = 0.0;
2333         if (ir->efep != efepNO)
2334         {
2335             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2336         }
2337         if (bCorrAll)
2338         {
2339             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2340             *enercorr += avctwelve*enerdiff;
2341             if (fr->efep != efepNO)
2342             {
2343                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2344             }
2345         }
2346
2347         if (bCorrPres)
2348         {
2349             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2350             if (ir->eDispCorr == edispcAllEnerPres)
2351             {
2352                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2353             }
2354             /* The factor 2 is because of the Gromacs virial definition */
2355             spres = -2.0*invvol*svir*PRESFAC;
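            /* I.e. the diagonal pressure correction is -(2/V)*svir, converted
             * to bar with PRESFAC, matching the virial correction added to
             * virial[m][m] below.
             */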
2356
2357             for (m = 0; m < DIM; m++)
2358             {
2359                 virial[m][m] += svir;
2360                 pres[m][m]   += spres;
2361             }
2362             *prescorr += spres;
2363         }
2364
2365         /* Can't currently control when this prints; for now, just print when debugging */
2366         if (debug)
2367         {
2368             if (bCorrAll)
2369             {
2370                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2371                         avcsix, avctwelve);
2372             }
2373             if (bCorrPres)
2374             {
2375                 fprintf(debug,
2376                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2377                         *enercorr, spres, svir);
2378             }
2379             else
2380             {
2381                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2382             }
2383         }
2384
2385         if (fr->efep != efepNO)
2386         {
2387             *dvdlcorr += dvdlambda;
2388         }
2389     }
2390 }
2391
2392 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2393                             const gmx_mtop_t *mtop, rvec x[],
2394                             gmx_bool bFirst)
2395 {
2396     t_graph        *graph;
2397     int             as, mol;
2398
2399     if (bFirst && fplog)
2400     {
2401         fprintf(fplog, "Removing pbc first time\n");
2402     }
2403
2404     snew(graph, 1);
2405     as = 0;
2406     for (const gmx_molblock_t &molb : mtop->molblock)
2407     {
2408         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2409         if (moltype.atoms.nr == 1 ||
2410             (!bFirst && moltype.cgs.nr == 1))
2411         {
2412             /* Just one atom or charge group in the molecule, no PBC required */
2413             as += molb.nmol*moltype.atoms.nr;
2414         }
2415         else
2416         {
2417             mk_graph_moltype(moltype, graph);
2418
2419             for (mol = 0; mol < molb.nmol; mol++)
2420             {
2421                 mk_mshift(fplog, graph, ePBC, box, x+as);
2422
2423                 shift_self(graph, box, x+as);
2424                 /* The molecule is whole now.
2425                  * We don't need the second mk_mshift call as in do_pbc_first,
2426                  * since we no longer need this graph.
2427                  */
2428
2429                 as += moltype.atoms.nr;
2430             }
2431             done_graph(graph);
2432         }
2433     }
2434     sfree(graph);
2435 }
2436
2437 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2438                        const gmx_mtop_t *mtop, rvec x[])
2439 {
2440     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2441 }
2442
2443 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2444                  const gmx_mtop_t *mtop, rvec x[])
2445 {
2446     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2447 }
2448
2449 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2450 {
2451     int t, nth;
2452     nth = gmx_omp_nthreads_get(emntDefault);
2453
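    /* Each thread handles one contiguous slice of x; computing both slice
     * boundaries as (natoms*t)/nth makes the slices tile [0, natoms) exactly,
     * with no gaps or overlaps, even when natoms is not divisible by nth.
     */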
2454 #pragma omp parallel for num_threads(nth) schedule(static)
2455     for (t = 0; t < nth; t++)
2456     {
2457         try
2458         {
2459             size_t natoms = x.size();
2460             size_t offset = (natoms*t    )/nth;
2461             size_t len    = (natoms*(t + 1))/nth - offset;
2462             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2463         }
2464         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2465     }
2466 }
2467
2468 // TODO This can be cleaned up a lot, and move back to runner.cpp
2469 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2470                 const t_inputrec *inputrec,
2471                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2472                 gmx_walltime_accounting_t walltime_accounting,
2473                 nonbonded_verlet_t *nbv,
2474                 const gmx_pme_t *pme,
2475                 gmx_bool bWriteStat)
2476 {
2477     t_nrnb *nrnb_tot = nullptr;
2478     double  delta_t  = 0;
2479     double  nbfs     = 0, mflop = 0;
2480     double  elapsed_time,
2481             elapsed_time_over_all_ranks,
2482             elapsed_time_over_all_threads,
2483             elapsed_time_over_all_threads_over_all_ranks;
2484     /* Control whether it is valid to print a report. Only the
2485        simulation master may print, but it should not do so if the run
2486        terminated e.g. before a scheduled reset step. This is
2487        complicated by the fact that PME ranks are unaware of the
2488        reason why they were sent a pmerecvqxFINISH. To avoid
2489        communication deadlocks, we always do the communication for the
2490        report, even if we've decided not to write the report, because
2491        how long it takes to finish the run is not important when we've
2492        decided not to report on the simulation performance.
2493
2494        Further, we only report performance for dynamical integrators,
2495        because those are the only ones for which we plan to
2496        consider doing any optimizations. */
2497     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2498
2499     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2500     {
2501         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2502         printReport = false;
2503     }
2504
2505     if (cr->nnodes > 1)
2506     {
2507         snew(nrnb_tot, 1);
2508 #if GMX_MPI
2509         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2510                       cr->mpi_comm_mysim);
2511 #endif
2512     }
2513     else
2514     {
2515         nrnb_tot = nrnb;
2516     }
2517
2518     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2519     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2520     if (cr->nnodes > 1)
2521     {
2522 #if GMX_MPI
2523         /* reduce elapsed_time over all MPI ranks in the current simulation */
2524         MPI_Allreduce(&elapsed_time,
2525                       &elapsed_time_over_all_ranks,
2526                       1, MPI_DOUBLE, MPI_SUM,
2527                       cr->mpi_comm_mysim);
2528         elapsed_time_over_all_ranks /= cr->nnodes;
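        /* Note: this is averaged over ranks, whereas the thread time reduced
         * below is left as a sum over ranks, presumably because print_perf()
         * uses it as the total core time.
         */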
2529         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2530          * current simulation. */
2531         MPI_Allreduce(&elapsed_time_over_all_threads,
2532                       &elapsed_time_over_all_threads_over_all_ranks,
2533                       1, MPI_DOUBLE, MPI_SUM,
2534                       cr->mpi_comm_mysim);
2535 #endif
2536     }
2537     else
2538     {
2539         elapsed_time_over_all_ranks                  = elapsed_time;
2540         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2541     }
2542
2543     if (printReport)
2544     {
2545         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2546     }
2547     if (cr->nnodes > 1)
2548     {
2549         sfree(nrnb_tot);
2550     }
2551
2552     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2553     {
2554         print_dd_statistics(cr, inputrec, fplog);
2555     }
2556
2557     /* TODO Move the responsibility for any scaling by thread counts
2558      * to the code that handled the thread region, so that there's a
2559      * mechanism to keep cycle counting working during the transition
2560      * to task parallelism. */
2561     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2562     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2563     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2564     auto cycle_sum(wallcycle_sum(cr, wcycle));
2565
2566     if (printReport)
2567     {
2568         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2569         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2570         if (pme_gpu_task_enabled(pme))
2571         {
2572             pme_gpu_get_timings(pme, &pme_gpu_timings);
2573         }
2574         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2575                         elapsed_time_over_all_ranks,
2576                         wcycle, cycle_sum,
2577                         nbnxn_gpu_timings,
2578                         &pme_gpu_timings);
2579
2580         if (EI_DYNAMICS(inputrec->eI))
2581         {
2582             delta_t = inputrec->delta_t;
2583         }
2584
2585         if (fplog)
2586         {
2587             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2588                        elapsed_time_over_all_ranks,
2589                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2590                        delta_t, nbfs, mflop);
2591         }
2592         if (bWriteStat)
2593         {
2594             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2595                        elapsed_time_over_all_ranks,
2596                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2597                        delta_t, nbfs, mflop);
2598         }
2599     }
2600 }
2601
2602 void initialize_lambdas(FILE               *fplog,
2603                         const t_inputrec   &ir,
2604                         bool                isMaster,
2605                         int                *fep_state,
2606                         gmx::ArrayRef<real> lambda,
2607                         double             *lam0)
2608 {
2609     /* TODO: Clean up initialization of fep_state and lambda in
2610        t_state.  This function works, but could probably use a logic
2611        rewrite to keep all the different types of efep straight. */
2612
2613     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2614     {
2615         return;
2616     }
2617
2618     const t_lambda *fep = ir.fepvals;
2619     if (isMaster)
2620     {
2621         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2622                                              a checkpoint, if one is set -- a kludge is
2623                                              in place for now to prevent this. */
2624     }
2625
2626     for (int i = 0; i < efptNR; i++)
2627     {
2628         double thisLambda;
2629         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2630         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2631         {
2632             thisLambda = fep->init_lambda;
2633         }
2634         else
2635         {
2636             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2637         }
2638         if (isMaster)
2639         {
2640             lambda[i] = thisLambda;
2641         }
2642         if (lam0 != nullptr)
2643         {
2644             lam0[i] = thisLambda;
2645         }
2646     }
2647     if (ir.bSimTemp)
2648     {
2649         /* need to rescale control temperatures to match current state */
2650         for (int i = 0; i < ir.opts.ngtc; i++)
2651         {
2652             if (ir.opts.ref_t[i] > 0)
2653             {
2654                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2655             }
2656         }
2657     }
2658
2659     /* Report the initial lambda components to the log */
2660     if (fplog != nullptr)
2661     {
2662         fprintf(fplog, "Initial vector of lambda components:[ ");
2663         for (const auto &l : lambda)
2664         {
2665             fprintf(fplog, "%10.4f ", l);
2666         }
2667         fprintf(fplog, "]\n");
2668     }
2669 }