Hide internals of nbnxm parlist
src/gromacs/mdlib/sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
117
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
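// Setting this variable to any value (e.g. GMX_DISABLE_ALTERNATING_GPU_WAIT=1 in the
// environment of the mdrun process) disables the alternating wait below and falls back
// to the static PME-first wait order.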
123
124
125 void print_time(FILE                     *out,
126                 gmx_walltime_accounting_t walltime_accounting,
127                 int64_t                   step,
128                 const t_inputrec         *ir,
129                 const t_commrec          *cr)
130 {
131     time_t finish;
132     double dt, elapsed_seconds, time_per_step;
133
134 #if !GMX_THREAD_MPI
135     if (!PAR(cr))
136 #endif
137     {
138         fprintf(out, "\r");
139     }
140     fputs("step ", out);
141     fputs(gmx::int64ToString(step).c_str(), out);
142     fflush(out);
143
144     if ((step >= ir->nstlist))
145     {
146         double seconds_since_epoch = gmx_gettime();
147         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
149         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
150
151         if (ir->nsteps >= 0)
152         {
153             if (dt >= 300)
154             {
155                 finish = static_cast<time_t>(seconds_since_epoch + dt);
156                 auto timebuf = gmx_ctime_r(&finish);
157                 timebuf.erase(timebuf.find_first_of('\n'));
158                 fputs(", will finish ", out);
159                 fputs(timebuf.c_str(), out);
160             }
161             else
162             {
163                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
164             }
165         }
166         else
167         {
168             fprintf(out, " performance: %.1f ns/day    ",
169                     ir->delta_t/1000*24*60*60/time_per_step);
170         }
171     }
172 #if !GMX_THREAD_MPI
173     if (PAR(cr))
174     {
175         fprintf(out, "\n");
176     }
177 #else
178     GMX_UNUSED_VALUE(cr);
179 #endif
180
181     fflush(out);
182 }
183
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185                          double the_time)
186 {
187     if (!fplog)
188     {
189         return;
190     }
191
192     time_t temp_time = static_cast<time_t>(the_time);
193
194     auto   timebuf = gmx_ctime_r(&temp_time);
195
196     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
197 }
198
199 void print_start(FILE *fplog, const t_commrec *cr,
200                  gmx_walltime_accounting_t walltime_accounting,
201                  const char *name)
202 {
203     char buf[STRLEN];
204
205     sprintf(buf, "Started %s", name);
206     print_date_and_time(fplog, cr->nodeid, buf,
207                         walltime_accounting_get_start_time_stamp(walltime_accounting));
208 }
209
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
211 {
212     const int      end = forceToAdd.size();
213
214     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
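    /* Each thread adds into a disjoint, statically scheduled chunk of f, so this
     * accumulation needs no reduction clause or atomic updates.
     */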
215 #pragma omp parallel for num_threads(nt) schedule(static)
216     for (int i = 0; i < end; i++)
217     {
218         rvec_inc(f[i], forceToAdd[i]);
219     }
220 }
221
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223                         tensor vir_part, const t_graph *graph, const matrix box,
224                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
225 {
226     /* The short-range virial from surrounding boxes */
227     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
229
230     /* Calculate partial virial, for local atoms only, based on short range.
231      * Total virial is computed in global_stat, called from do_md
232      */
233     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
235
236     if (debug)
237     {
238         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
239     }
240 }
241
242 static void pull_potential_wrapper(const t_commrec *cr,
243                                    const t_inputrec *ir,
244                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245                                    gmx::ForceWithVirial *force,
246                                    const t_mdatoms *mdatoms,
247                                    gmx_enerdata_t *enerd,
248                                    const real *lambda,
249                                    double t,
250                                    gmx_wallcycle_t wcycle)
251 {
252     t_pbc  pbc;
253     real   dvdl;
254
255     /* Calculate the center of mass forces; this requires communication,
256      * which is why pull_potential is called close to other communication.
257      */
258     wallcycle_start(wcycle, ewcPULLPOT);
259     set_pbc(&pbc, ir->ePBC, box);
260     dvdl                     = 0;
261     enerd->term[F_COM_PULL] +=
262         pull_potential(ir->pull_work, mdatoms, &pbc,
263                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265     wallcycle_stop(wcycle, ewcPULLPOT);
266 }
267
268 static void pme_receive_force_ener(const t_commrec      *cr,
269                                    gmx::ForceWithVirial *forceWithVirial,
270                                    gmx_enerdata_t       *enerd,
271                                    gmx_wallcycle_t       wcycle)
272 {
273     real   e_q, e_lj, dvdl_q, dvdl_lj;
274     float  cycles_ppdpme, cycles_seppme;
275
276     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
278
279     /* In case of node-splitting, the PP nodes receive the long-range
280      * forces, virial and energy from the PME nodes here.
281      */
282     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283     dvdl_q  = 0;
284     dvdl_lj = 0;
285     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286                       &cycles_seppme);
287     enerd->term[F_COUL_RECIP] += e_q;
288     enerd->term[F_LJ_RECIP]   += e_lj;
289     enerd->dvdl_lin[efptCOUL] += dvdl_q;
290     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
291
292     if (wcycle)
293     {
294         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
295     }
296     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
297 }
298
299 static void print_large_forces(FILE            *fp,
300                                const t_mdatoms *md,
301                                const t_commrec *cr,
302                                int64_t          step,
303                                real             forceTolerance,
304                                const rvec      *x,
305                                const rvec      *f)
306 {
307     real           force2Tolerance = gmx::square(forceTolerance);
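    /* Comparing squared norms against the squared tolerance avoids taking a square
     * root for every atom; the square root is only computed for atoms that get printed.
     */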
308     gmx::index     numNonFinite    = 0;
309     for (int i = 0; i < md->homenr; i++)
310     {
311         real force2    = norm2(f[i]);
312         bool nonFinite = !std::isfinite(force2);
313         if (force2 >= force2Tolerance || nonFinite)
314         {
315             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
316                     step,
317                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
318         }
319         if (nonFinite)
320         {
321             numNonFinite++;
322         }
323     }
324     if (numNonFinite > 0)
325     {
326         /* Note that with MPI this fatal call on one rank might interrupt
327          * the printing on other ranks. But we can only avoid that with
328          * an expensive MPI barrier that we would need at each step.
329          */
330         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
331     }
332 }
333
334 static void post_process_forces(const t_commrec           *cr,
335                                 int64_t                    step,
336                                 t_nrnb                    *nrnb,
337                                 gmx_wallcycle_t            wcycle,
338                                 const gmx_localtop_t      *top,
339                                 const matrix               box,
340                                 const rvec                 x[],
341                                 rvec                       f[],
342                                 gmx::ForceWithVirial      *forceWithVirial,
343                                 tensor                     vir_force,
344                                 const t_mdatoms           *mdatoms,
345                                 const t_graph             *graph,
346                                 const t_forcerec          *fr,
347                                 const gmx_vsite_t         *vsite,
348                                 int                        flags)
349 {
350     if (fr->haveDirectVirialContributions)
351     {
352         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
353
354         if (vsite)
355         {
356             /* Spread the mesh force on virtual sites to the other particles...
357              * This is parallellized. MPI communication is performed
358              * if the constructing atoms aren't local.
359              */
360             matrix virial = { { 0 } };
361             spread_vsite_f(vsite, x, fDirectVir, nullptr,
362                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
363                            nrnb,
364                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365             forceWithVirial->addVirialContribution(virial);
366         }
367
368         if (flags & GMX_FORCE_VIRIAL)
369         {
370             /* Now add the forces, this is local */
371             sum_forces(f, forceWithVirial->force_);
372
373             /* Add the direct virial contributions */
374             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
376
377             if (debug)
378             {
379                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
380             }
381         }
382     }
383
384     if (fr->print_force >= 0)
385     {
386         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
387     }
388 }
389
390 static void do_nb_verlet(t_forcerec                       *fr,
391                          const interaction_const_t        *ic,
392                          gmx_enerdata_t                   *enerd,
393                          const int                         flags,
394                          const Nbnxm::InteractionLocality  ilocality,
395                          const int                         clearF,
396                          const int64_t                     step,
397                          t_nrnb                           *nrnb,
398                          gmx_wallcycle_t                   wcycle)
399 {
400     if (!(flags & GMX_FORCE_NONBONDED))
401     {
402         /* skip non-bonded calculation */
403         return;
404     }
405
406     nonbonded_verlet_t *nbv  = fr->nbv;
407
408     /* GPU kernel launch overhead is already timed separately */
409     if (fr->cutoff_scheme != ecutsVERLET)
410     {
411         gmx_incons("Invalid cut-off scheme passed!");
412     }
413
414     if (!nbv->useGpu())
415     {
416         /* When dynamic pair-list pruning is requested, we need to prune
417          * every nstlistPrune steps.
418          */
419         if (nbv->pairlistSets().isDynamicPairlistPruningStep(step))
420         {
421             /* Prune the pair-list beyond fr->ic->rlistPrune using
422              * the current coordinates of the atoms.
423              */
424             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
425             nbv->dispatchPruneKernel(ilocality, fr->shift_vec);
426             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
427         }
428
429         wallcycle_sub_start(wcycle, ewcsNONBONDED);
430     }
431
432     nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, fr, enerd, nrnb);
433
434     if (!nbv->useGpu())
435     {
436         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
437     }
438 }
439
440 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
441 {
442     return nbv != nullptr && nbv->useGpu();
443 }
444
445 static inline void clear_rvecs_omp(int n, rvec v[])
446 {
447     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
448
449     /* Note that we would like to avoid this conditional by putting it
450      * into the omp pragma instead, but then we still take the full
451      * omp parallel for overhead (at least with gcc5).
452      */
453     if (nth == 1)
454     {
455         for (int i = 0; i < n; i++)
456         {
457             clear_rvec(v[i]);
458         }
459     }
460     else
461     {
462 #pragma omp parallel for num_threads(nth) schedule(static)
463         for (int i = 0; i < n; i++)
464         {
465             clear_rvec(v[i]);
466         }
467     }
468 }
469
470 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
471  *
472  * \param groupOptions  Group options, containing T-coupling options
473  */
474 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
475 {
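    /* For each temperature-coupled group the average kinetic energy is estimated from
     * equipartition as ndf * 1/2 * kB * T_ref. The sum over coupled groups is then
     * scaled by (ndf_coupled + ndf_uncoupled)/ndf_coupled to also account for the
     * uncoupled degrees of freedom.
     */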
476     real nrdfCoupled   = 0;
477     real nrdfUncoupled = 0;
478     real kineticEnergy = 0;
479     for (int g = 0; g < groupOptions.ngtc; g++)
480     {
481         if (groupOptions.tau_t[g] >= 0)
482         {
483             nrdfCoupled   += groupOptions.nrdf[g];
484             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
485         }
486         else
487         {
488             nrdfUncoupled += groupOptions.nrdf[g];
489         }
490     }
491
492     /* This conditional with > also catches nrdf=0 */
493     if (nrdfCoupled > nrdfUncoupled)
494     {
495         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
496     }
497     else
498     {
499         return 0;
500     }
501 }
502
503 /*! \brief This routine checks that the potential energy is finite.
504  *
505  * Always checks that the potential energy is finite. If step equals
506  * inputrec.init_step also checks that the magnitude of the potential energy
507  * is reasonable. Terminates with a fatal error when a check fails.
508  * Note that passing this check does not guarantee finite forces,
509  * since those use slightly different arithmetic. But in most cases
510  * there is just a narrow coordinate range where forces are not finite
511  * and energies are finite.
512  *
513  * \param[in] step      The step number, used for checking and printing
514  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
515  * \param[in] inputrec  The input record
516  */
517 static void checkPotentialEnergyValidity(int64_t               step,
518                                          const gmx_enerdata_t &enerd,
519                                          const t_inputrec     &inputrec)
520 {
521     /* Threshold valid for comparing absolute potential energy against
522      * the kinetic energy. Normally one should not consider absolute
523      * potential energy values, but with a factor of one million
524      * we should never get false positives.
525      */
526     constexpr real c_thresholdFactor = 1e6;
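    /* As a rough illustration: a system with 30000 degrees of freedom at 300 K has an
     * average kinetic energy of about 30000*0.5*kB*300 ~ 4e4 kJ/mol, so the check below
     * only triggers for potential energies above roughly 4e10 kJ/mol.
     */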
527
528     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
529     real           averageKineticEnergy = 0;
530     /* We only check for large potential energy at the initial step,
531      * because that is by far the most likely step for this to occur
532      * and because computing the average kinetic energy is not free.
533      * Note: nstcalcenergy >> 1 often does not allow catching large energies
534      * before they become NaN.
535      */
536     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
537     {
538         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
539     }
540
541     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
542                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
543     {
544         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
545                   step,
546                   enerd.term[F_EPOT],
547                   energyIsNotFinite ? "not finite" : "extremely high",
548                   enerd.term[F_LJ],
549                   enerd.term[F_COUL_SR],
550                   energyIsNotFinite ? "non-finite" : "very high",
551                   energyIsNotFinite ? " or NaN" : "");
552     }
553 }
554
555 /*! \brief Compute forces and/or energies for special algorithms
556  *
557  * The intention is to collect all calls to algorithms that compute
558  * forces on local atoms only and that do not contribute to the local
559  * virial sum (but add their virial contribution separately).
560  * Eventually these should likely all become ForceProviders.
561  * Within this function the intention is to have algorithms that do
562  * global communication at the end, so global barriers within the MD loop
563  * are as close together as possible.
564  *
565  * \param[in]     fplog            The log file
566  * \param[in]     cr               The communication record
567  * \param[in]     inputrec         The input record
568  * \param[in]     awh              The Awh module (nullptr if none in use).
569  * \param[in]     enforcedRotation Enforced rotation module.
570  * \param[in]     step             The current MD step
571  * \param[in]     t                The current time
572  * \param[in,out] wcycle           Wallcycle accounting struct
573  * \param[in,out] forceProviders   Pointer to a list of force providers
574  * \param[in]     box              The unit cell
575  * \param[in]     x                The coordinates
576  * \param[in]     mdatoms          Per atom properties
577  * \param[in]     lambda           Array of free-energy lambda values
578  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
579  * \param[in,out] forceWithVirial  Force and virial buffers
580  * \param[in,out] enerd            Energy buffer
581  * \param[in,out] ed               Essential dynamics pointer
582  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
583  *
584  * \todo Remove bNS, which is used incorrectly.
585  * \todo Convert all other algorithms called here to ForceProviders.
586  */
587 static void
588 computeSpecialForces(FILE                          *fplog,
589                      const t_commrec               *cr,
590                      const t_inputrec              *inputrec,
591                      gmx::Awh                      *awh,
592                      gmx_enfrot                    *enforcedRotation,
593                      int64_t                        step,
594                      double                         t,
595                      gmx_wallcycle_t                wcycle,
596                      ForceProviders                *forceProviders,
597                      matrix                         box,
598                      gmx::ArrayRef<const gmx::RVec> x,
599                      const t_mdatoms               *mdatoms,
600                      real                          *lambda,
601                      int                            forceFlags,
602                      gmx::ForceWithVirial          *forceWithVirial,
603                      gmx_enerdata_t                *enerd,
604                      gmx_edsam                     *ed,
605                      gmx_bool                       bNS)
606 {
607     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
608
609     /* NOTE: Currently all ForceProviders only provide forces.
610      *       When they also provide energies, remove this conditional.
611      */
612     if (computeForces)
613     {
614         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
615         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
616
617         /* Collect forces from modules */
618         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
619     }
620
621     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
622     {
623         pull_potential_wrapper(cr, inputrec, box, x,
624                                forceWithVirial,
625                                mdatoms, enerd, lambda, t,
626                                wcycle);
627
628         if (awh)
629         {
630             enerd->term[F_COM_PULL] +=
631                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
632                                                   forceWithVirial,
633                                                   t, step, wcycle, fplog);
634         }
635     }
636
637     rvec *f = as_rvec_array(forceWithVirial->force_.data());
638
639     /* Add the forces from enforced rotation potentials (if any) */
640     if (inputrec->bRot)
641     {
642         wallcycle_start(wcycle, ewcROTadd);
643         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
644         wallcycle_stop(wcycle, ewcROTadd);
645     }
646
647     if (ed)
648     {
649         /* Note that since init_edsam() is called after the initialization
650          * of forcerec, edsam doesn't request the noVirSum force buffer.
651          * Thus if no other algorithm (e.g. PME) requires it, the forces
652          * here will contribute to the virial.
653          */
654         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
655     }
656
657     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
658     if (inputrec->bIMD && computeForces)
659     {
660         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
661     }
662 }
663
664 /*! \brief Launch the prepare_step and spread stages of PME GPU.
665  *
666  * \param[in]  pmedata       The PME structure
667  * \param[in]  box           The box matrix
668  * \param[in]  x             Coordinate array
669  * \param[in]  flags         Force flags
670  * \param[in]  pmeFlags      PME flags
671  * \param[in]  wcycle        The wallcycle structure
672  */
673 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
674                                       matrix          box,
675                                       rvec            x[],
676                                       int             flags,
677                                       int             pmeFlags,
678                                       gmx_wallcycle_t wcycle)
679 {
680     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
681     pme_gpu_launch_spread(pmedata, x, wcycle);
682 }
683
684 /*! \brief Launch the FFT and gather stages of PME GPU
685  *
686  * This function only implements setting the output forces (no accumulation).
687  *
688  * \param[in]  pmedata        The PME structure
689  * \param[in]  wcycle         The wallcycle structure
690  */
691 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
692                                      gmx_wallcycle_t   wcycle)
693 {
694     pme_gpu_launch_complex_transforms(pmedata, wcycle);
695     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
696 }
697
698 /*! \brief
699  *  Polling wait for either of the PME or nonbonded GPU tasks.
700  *
701  * Instead of waiting for the GPU tasks in a static order, this function
702  * polls to check which of the two tasks completes first, and performs the
703  * associated force buffer reduction overlapped with the other task.
704  * Unlike a static scheduling order, this can always overlap
705  * one of the reductions, regardless of the GPU task completion order.
706  *
707  * \param[in]     nbv              Nonbonded verlet structure
708  * \param[in,out] pmedata          PME module data
709  * \param[in,out] force            Force array to reduce task outputs into.
710  * \param[in,out] forceWithVirial  Force and virial buffers
711  * \param[in,out] fshift           Shift force output vector results are reduced into
712  * \param[in,out] enerd            Energy data structure results are reduced into
713  * \param[in]     flags            Force flags
714  * \param[in]     pmeFlags         PME flags
715  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
716  * \param[in]     wcycle           The wallcycle structure
717  */
718 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
719                                         gmx_pme_t                           *pmedata,
720                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
721                                         gmx::ForceWithVirial                *forceWithVirial,
722                                         rvec                                 fshift[],
723                                         gmx_enerdata_t                      *enerd,
724                                         int                                  flags,
725                                         int                                  pmeFlags,
726                                         bool                                 haveOtherWork,
727                                         gmx_wallcycle_t                      wcycle)
728 {
729     bool isPmeGpuDone = false;
730     bool isNbGpuDone  = false;
731
732
733     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
734
735     while (!isPmeGpuDone || !isNbGpuDone)
736     {
737         if (!isPmeGpuDone)
738         {
739             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
740             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
741         }
742
743         if (!isNbGpuDone)
744         {
745             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
746             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
747             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
748                                                      flags,
749                                                      Nbnxm::AtomLocality::Local,
750                                                      haveOtherWork,
751                                                      enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
752                                                      fshift, completionType);
753             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
754             // To get the call count right, when the task finishes we
755             // issue a start/stop.
756             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
757             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
758             if (isNbGpuDone)
759             {
760                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
761                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
762
763                 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
764                                               as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
765             }
766         }
767     }
768 }
769
770 /*! \brief
771  *  Launch the dynamic rolling pruning GPU task.
772  *
773  *  We currently alternate local/non-local list pruning in odd-even steps
774  *  (only pruning every second step without DD).
775  *
776  * \param[in]     cr               The communication record
777  * \param[in]     nbv              Nonbonded verlet structure
778  * \param[in]     inputrec         The input record
779  * \param[in]     step             The current MD step
780  */
781 static inline void launchGpuRollingPruning(const t_commrec          *cr,
782                                            const nonbonded_verlet_t *nbv,
783                                            const t_inputrec         *inputrec,
784                                            const int64_t             step)
785 {
786     /* We should not launch the rolling pruning kernel at a search
787      * step or just before search steps, since that's useless.
788      * Without domain decomposition we prune at even steps.
789      * With domain decomposition we alternate local and non-local
790      * pruning at even and odd steps.
791      */
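    /* As an illustration: with nstlistPrune = 10 we get numRollingParts = 5, so one
     * fifth of the list is pruned on each step of the matching parity and the whole
     * list has been pruned once after 10 steps.
     */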
792     int  numRollingParts     = nbv->pairlistSets().params().numRollingParts;
793     GMX_ASSERT(numRollingParts == nbv->pairlistSets().params().nstlistPrune/2,
794                "Since we alternate local/non-local at even/odd steps, "
795                "we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
796     int  stepWithCurrentList = nbv->pairlistSets().numStepsWithPairlist(step);
797     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
798     if (stepWithCurrentList > 0 &&
799         stepWithCurrentList < inputrec->nstlist - 1 &&
800         (stepIsEven || havePPDomainDecomposition(cr)))
801     {
802         Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
803                                            stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
804                                            numRollingParts);
805     }
806 }
807
808 static void do_force_cutsVERLET(FILE *fplog,
809                                 const t_commrec *cr,
810                                 const gmx_multisim_t *ms,
811                                 const t_inputrec *inputrec,
812                                 gmx::Awh *awh,
813                                 gmx_enfrot *enforcedRotation,
814                                 int64_t step,
815                                 t_nrnb *nrnb,
816                                 gmx_wallcycle_t wcycle,
817                                 const gmx_localtop_t *top,
818                                 const gmx_groups_t * /* groups */,
819                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
820                                 history_t *hist,
821                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
822                                 tensor vir_force,
823                                 const t_mdatoms *mdatoms,
824                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
825                                 real *lambda,
826                                 t_graph *graph,
827                                 t_forcerec *fr,
828                                 gmx::PpForceWorkload *ppForceWorkload,
829                                 interaction_const_t *ic,
830                                 const gmx_vsite_t *vsite,
831                                 rvec mu_tot,
832                                 double t,
833                                 gmx_edsam *ed,
834                                 const int flags,
835                                 const DDBalanceRegionHandler &ddBalanceRegionHandler)
836 {
837     int                 cg1, i, j;
838     double              mu[2*DIM];
839     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
840     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
841     rvec                vzero, box_diag;
842     float               cycles_pme, cycles_wait_gpu;
843     nonbonded_verlet_t *nbv = fr->nbv;
844
845     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
846     bNS           = ((flags & GMX_FORCE_NS) != 0);
847     bFillGrid     = (bNS && bStateChanged);
848     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
849     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
850     bUseGPU       = fr->nbv->useGpu();
851     bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();
852
853     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
854     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
855     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
856         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
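    // Either an energy or a virial request selects the combined GMX_PME_CALC_ENER_VIR
    // flag, as the PME module evaluates energies and the virial together.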
857     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
858         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
859         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
860         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
861
862     /* At a search step we need to start the first balancing region
863      * somewhere early inside the step after communication during domain
864      * decomposition (and not during the previous step as usual).
865      */
866     if (bNS)
867     {
868         ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
869     }
870
871     cycles_wait_gpu = 0;
872
873     const int start  = 0;
874     const int homenr = mdatoms->homenr;
875
876     clear_mat(vir_force);
877
878     if (DOMAINDECOMP(cr))
879     {
880         cg1 = cr->dd->globalAtomGroupIndices.size();
881     }
882     else
883     {
884         cg1 = top->cgs.nr;
885     }
886     if (fr->n_tpi > 0)
887     {
888         cg1--;
889     }
890
891     if (bStateChanged)
892     {
893         update_forcerec(fr, box);
894
895         if (inputrecNeedMutot(inputrec))
896         {
897             /* Calculate total (local) dipole moment in a temporary common array.
898              * This makes it possible to sum them over nodes faster.
899              */
900             calc_mu(start, homenr,
901                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
902                     mu, mu+DIM);
903         }
904     }
905
906     if (fr->ePBC != epbcNONE)
907     {
908         /* Compute shift vectors every step,
909          * because of pressure coupling or box deformation!
910          */
911         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
912         {
913             calc_shifts(box, fr->shift_vec);
914         }
915
916         if (bCalcCGCM)
917         {
918             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
919             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
920         }
921         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
922         {
923             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
924         }
925     }
926
927     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
928                                  fr->shift_vec, nbv->nbat);
929
930 #if GMX_MPI
931     if (!thisRankHasDuty(cr, DUTY_PME))
932     {
933         /* Send particle coordinates to the PME nodes.
934          * Since this is only implemented for domain decomposition
935          * and domain decomposition does not use the graph,
936          * we do not need to worry about shifting.
937          */
938         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
939                                  lambda[efptCOUL], lambda[efptVDW],
940                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
941                                  step, wcycle);
942     }
943 #endif /* GMX_MPI */
944
945     if (useGpuPme)
946     {
947         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
948     }
949
950     /* do gridding for pair search */
951     if (bNS)
952     {
953         if (graph && bStateChanged)
954         {
955             /* Calculate intramolecular shift vectors to make molecules whole */
956             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
957         }
958
959         clear_rvec(vzero);
960         box_diag[XX] = box[XX][XX];
961         box_diag[YY] = box[YY][YY];
962         box_diag[ZZ] = box[ZZ][ZZ];
963
964         wallcycle_start(wcycle, ewcNS);
965         if (!DOMAINDECOMP(cr))
966         {
967             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
968             nbnxn_put_on_grid(nbv, box,
969                               0, vzero, box_diag,
970                               nullptr, 0, mdatoms->homenr, -1,
971                               fr->cginfo, x.unpaddedArrayRef(),
972                               0, nullptr);
973             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
974         }
975         else
976         {
977             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
978             nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd),
979                                        fr->cginfo, x.unpaddedArrayRef());
980             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
981         }
982
983         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
984
985         wallcycle_stop(wcycle, ewcNS);
986     }
987
988     /* initialize the GPU atom data and copy shift vector */
989     if (bUseGPU)
990     {
991         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
992         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
993
994         if (bNS)
995         {
996             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
997         }
998
999         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1000
1001         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1002
1003         if (bNS && fr->gpuBonded)
1004         {
1005             /* Now that we have put all atoms on the grid, we can assign bonded
1006              * interactions to the GPU, where the grid order is
1007              * needed. Also the xq, f and fshift device buffers have
1008              * been reallocated if needed, so the bonded code can
1009              * learn about them. */
1010             // TODO the xq, f, and fshift buffers are now shared
1011             // resources, so they should be maintained by a
1012             // higher-level object than the nb module.
1013             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1014                                                                   top->idef,
1015                                                                   Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1016                                                                   Nbnxm::gpu_get_f(nbv->gpu_nbv),
1017                                                                   Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1018             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1019         }
1020
1021         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1022     }
1023
1024     /* do local pair search */
1025     if (bNS)
1026     {
1027         wallcycle_start_nocount(wcycle, ewcNS);
1028         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1029         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1030         nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
1031                                &top->excls, step, nrnb);
1032         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1033         wallcycle_stop(wcycle, ewcNS);
1034     }
1035     else
1036     {
1037         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1038                                         FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1039                                         nbv->nbat, wcycle);
1040     }
1041
1042     if (bUseGPU)
1043     {
1044         ddBalanceRegionHandler.openBeforeForceComputationGpu();
1045
1046         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1047
1048         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1049         Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1050         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1051
1052         // The bonded work is not split into separate local and non-local parts, so with DD
1053         // we can only launch the kernel after the non-local coordinates have been received.
1054         if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
1055         {
1056             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1057             fr->gpuBonded->launchKernels(fr, flags, box);
1058             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1059         }
1060
1061         /* launch local nonbonded work on GPU */
1062         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1063         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1064                      step, nrnb, wcycle);
1065         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1066         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1067     }
1068
1069     if (useGpuPme)
1070     {
1071         // In PME GPU and mixed mode we launch FFT / gather after the
1072         // X copy/transform to allow overlap as well as after the GPU NB
1073         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1074         // the nonbonded kernel.
1075         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1076     }
1077
1078     /* Communicate coordinates and sum dipole if necessary +
1079        do non-local pair search */
1080     if (havePPDomainDecomposition(cr))
1081     {
1082         if (bNS)
1083         {
1084             wallcycle_start_nocount(wcycle, ewcNS);
1085             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1086             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1087             nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
1088                                    &top->excls, step, nrnb);
1089             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1090             wallcycle_stop(wcycle, ewcNS);
1091         }
1092         else
1093         {
1094             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1095
1096             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1097                                             FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1098                                             nbv->nbat, wcycle);
1099         }
1100
1101         if (bUseGPU)
1102         {
1103             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1104
1105             /* launch non-local nonbonded tasks on GPU */
1106             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1107             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1108             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1109
1110             if (ppForceWorkload->haveGpuBondedWork)
1111             {
1112                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1113                 fr->gpuBonded->launchKernels(fr, flags, box);
1114                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1115             }
1116
1117             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1118             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1119                          step, nrnb, wcycle);
1120             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1121
1122             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1123         }
1124     }
1125
1126     if (bUseGPU)
1127     {
1128         /* launch the device-to-host (D2H) copy-back of the forces */
1129         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1130         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1131         if (havePPDomainDecomposition(cr))
1132         {
1133             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1134                                       flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1135         }
1136         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1137                                   flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1138         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1139
1140         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1141         {
1142             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1143             fr->gpuBonded->launchEnergyTransfer();
1144             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1145         }
1146         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1147     }
1148
1149     if (bStateChanged && inputrecNeedMutot(inputrec))
1150     {
1151         if (PAR(cr))
1152         {
1153             gmx_sumd(2*DIM, mu, cr);
1154
1155             ddBalanceRegionHandler.reopenRegionCpu();
1156         }
1157
1158         for (i = 0; i < 2; i++)
1159         {
1160             for (j = 0; j < DIM; j++)
1161             {
1162                 fr->mu_tot[i][j] = mu[i*DIM + j];
1163             }
1164         }
1165     }
1166     if (fr->efep == efepNO)
1167     {
1168         copy_rvec(fr->mu_tot[0], mu_tot);
1169     }
1170     else
1171     {
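        /* With free-energy perturbation the charges differ between the A and B states,
         * so interpolate the total dipole linearly in the Coulomb lambda between the
         * two states.
         */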
1172         for (j = 0; j < DIM; j++)
1173         {
1174             mu_tot[j] =
1175                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1176                 lambda[efptCOUL]*fr->mu_tot[1][j];
1177         }
1178     }
1179
1180     /* Reset energies */
1181     reset_enerdata(enerd);
1182     clear_rvecs(SHIFTS, fr->fshift);
1183
1184     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1185     {
1186         wallcycle_start(wcycle, ewcPPDURINGPME);
1187         dd_force_flop_start(cr->dd, nrnb);
1188     }
1189
1190     if (inputrec->bRot)
1191     {
1192         wallcycle_start(wcycle, ewcROT);
1193         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1194         wallcycle_stop(wcycle, ewcROT);
1195     }
1196
1197     /* Temporary solution until all routines take PaddedRVecVector */
1198     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1199
1200     /* Start the force cycle counter.
1201      * Note that a different counter is used for dynamic load balancing.
1202      */
1203     wallcycle_start(wcycle, ewcFORCE);
1204
1205     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1206     if (bDoForces)
1207     {
1208         /* If we need to compute the virial, we might need a separate
1209          * force buffer for algorithms for which the virial is calculated
1210          * directly, such as PME.
1211          */
1212         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1213         {
1214             forceRef = *fr->forceBufferForDirectVirialContributions;
1215
1216             /* TODO: update comment
1217              * We only compute forces on local atoms. Note that vsites can
1218              * spread to non-local atoms, but that part of the buffer is
1219              * cleared separately in the vsite spreading code.
1220              */
1221             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1222         }
1223         /* Clear the short- and long-range forces */
1224         clear_rvecs_omp(fr->natoms_force_constr, f);
1225     }
1226
1227     /* forceWithVirial uses the local atom range only */
1228     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1229
1230     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1231     {
1232         clear_pull_forces(inputrec->pull_work);
1233     }
1234
1235     /* When the non-bonded forces are computed on the CPU, we calculate them here.
1236      * We do this before calling do_force_lowlevel, because in that
1237      * function, the listed forces are calculated before PME, which
1238      * does communication.  With this order, non-bonded and listed
1239      * force calculation imbalance can be balanced out by the domain
1240      * decomposition load balancing.
1241      */
1242
1243     if (!bUseOrEmulGPU)
1244     {
1245         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1246                      step, nrnb, wcycle);
1247     }
1248
1249     if (fr->efep != efepNO)
1250     {
1251         /* Calculate the local and non-local free energy interactions here.
1252          * These are always computed on the CPU, both with and without a GPU.
1253          */
1254         wallcycle_sub_start(wcycle, ewcsNONBONDED);
1255         nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
1256                                       fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
1257                                       inputrec->fepvals, lambda,
1258                                       enerd, flags, nrnb);
1259
1260         if (havePPDomainDecomposition(cr))
1261         {
1262             nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
1263                                           fr, as_rvec_array(x.unpaddedArrayRef().data()), f, *mdatoms,
1264                                           inputrec->fepvals, lambda,
1265                                           enerd, flags, nrnb);
1266         }
1267         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
1268     }
1269
1270     if (!bUseOrEmulGPU)
1271     {
1272         if (havePPDomainDecomposition(cr))
1273         {
1274             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1275                          step, nrnb, wcycle);
1276         }
1277
1278         /* Add all the non-bonded forces to the normal force array.
1279          * This can be split into a local and a non-local part when overlapping
1280          * communication with calculation under domain decomposition.
1281          */
1282         wallcycle_stop(wcycle, ewcFORCE);
1283
1284         nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, f, wcycle);
1285
1286         wallcycle_start_nocount(wcycle, ewcFORCE);
1287
1288         /* If there are multiple fshift output buffers we need to reduce them */
1289         if (flags & GMX_FORCE_VIRIAL)
1290         {
1291             /* This is not in a subcounter because it takes a
1292                negligible and constant-sized amount of time */
1293             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1294                                                      fr->fshift);
1295         }
1296     }
1297
1298     /* update QMMMrec, if necessary */
1299     if (fr->bQMMM)
1300     {
1301         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1302     }
1303
1304     /* Compute the bonded and non-bonded energies and optionally forces */
1305     do_force_lowlevel(fr, inputrec, &(top->idef),
1306                       cr, ms, nrnb, wcycle, mdatoms,
1307                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1308                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1309                       flags,
1310                       &cycles_pme, ddBalanceRegionHandler);
1311
1312     wallcycle_stop(wcycle, ewcFORCE);
1313
1314     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1315                          step, t, wcycle,
1316                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1317                          flags, &forceWithVirial, enerd,
1318                          ed, bNS);
1319
1320     if (bUseOrEmulGPU)
1321     {
1322         /* wait for non-local forces (or calculate in emulation mode) */
1323         if (havePPDomainDecomposition(cr))
1324         {
1325             if (bUseGPU)
1326             {
1327                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1328                 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1329                                             flags, Nbnxm::AtomLocality::NonLocal,
1330                                             ppForceWorkload->haveGpuBondedWork,
1331                                             enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1332                                             fr->fshift);
1333                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1334             }
1335             else
1336             {
1337                 wallcycle_start_nocount(wcycle, ewcFORCE);
1338                 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1339                              step, nrnb, wcycle);
1340                 wallcycle_stop(wcycle, ewcFORCE);
1341             }
1342
1343             nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
1344                                           f, wcycle);
1345         }
1346     }
1347
1348     if (havePPDomainDecomposition(cr))
1349     {
1350         /* We are done with the CPU compute.
1351          * We will now communicate the non-local forces.
1352          * If we use a GPU this will overlap with GPU work, so in that case
1353          * we do not close the DD force balancing region here.
1354          */
1355         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1356
1357         if (bDoForces)
1358         {
1359             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1360         }
1361     }
1362
1363     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1364     // an alternating wait/reduction scheme.
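    // (As implemented in alternatePmeNbGpuWaitReduce, this is expected to poll both the
    // PME and the local nonbonded GPU tasks and to reduce the output of whichever
    // finishes first, rather than waiting for them in a fixed order.)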
1365     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1366     if (alternateGpuWait)
1367     {
1368         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1369     }
1370
1371     if (!alternateGpuWait && useGpuPme)
1372     {
1373         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1374     }
1375
1376     /* Wait for local GPU NB outputs on the non-alternating wait path */
1377     if (!alternateGpuWait && bUseGPU)
1378     {
1379         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1380          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1381          * but even with a step of 0.1 ms the difference is less than 1%
1382          * of the step time.
1383          */
1384         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1385
1386         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1387         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1388                                     flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1389                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1390                                     fr->fshift);
1391         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1392
1393         if (ddBalanceRegionHandler.useBalancingRegion())
1394         {
1395             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1396             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1397             {
1398                 /* We measured only a few cycles, so it could be that the
1399                  * kernel and transfer finished earlier and there was no
1400                  * actual wait time, only API call overhead.
1401                  * In that case the actual time could be anywhere between 0
1402                  * and cycles_wait_est; we will use half of cycles_wait_est.
1403                  */
1404                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1405             }
1406             ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
1407         }
1408     }
1409
1410     if (fr->nbv->emulateGpu())
1411     {
1412         // NOTE: emulation kernel is not included in the balancing region,
1413         // but emulation mode does not target performance anyway
1414         wallcycle_start_nocount(wcycle, ewcFORCE);
1415         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1416                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1417                      step, nrnb, wcycle);
1418         wallcycle_stop(wcycle, ewcFORCE);
1419     }
1420
1421     if (useGpuPme)
1422     {
1423         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1424     }
1425
1426     if (bUseGPU)
1427     {
1428         /* now clear the GPU outputs while we finish the step on the CPU */
1429         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1430         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1431         Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1432
1433         /* Is dynamic pair-list pruning activated? */
1434         if (nbv->pairlistSets().params().useDynamicPruning)
1435         {
1436             launchGpuRollingPruning(cr, nbv, inputrec, step);
1437         }
1438         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1439         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1440     }
1441
1442     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1443     {
1444         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1445         // in principle this should be included in the DD balancing region,
1446         // but generally it is infrequent so we'll omit it for the sake of
1447         // simpler code
1448         fr->gpuBonded->accumulateEnergyTerms(enerd);
1449         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1450
1451         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1452         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1453         fr->gpuBonded->clearEnergies();
1454         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1455         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1456     }
1457
1458     /* Do the nonbonded GPU (or emulation) force buffer reduction
1459      * on the non-alternating path. */
1460     if (bUseOrEmulGPU && !alternateGpuWait)
1461     {
1462         nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
1463                                       f, wcycle);
1464     }
1465     if (DOMAINDECOMP(cr))
1466     {
1467         dd_force_flop_stop(cr->dd, nrnb);
1468     }
1469
1470     if (bDoForces)
1471     {
1472         /* If we have NoVirSum forces, but we do not calculate the virial,
1473          * we sum fr->f_novirsum=f later.
1474          */
1475         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1476         {
1477             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1478                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1479         }
1480
1481         if (flags & GMX_FORCE_VIRIAL)
1482         {
1483             /* Calculation of the virial must be done after vsites! */
1484             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1485                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1486         }
1487     }
1488
1489     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1490     {
1491         /* In case of node-splitting, the PP nodes receive the long-range
1492          * forces, virial and energy from the PME nodes here.
1493          */
1494         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1495     }
1496
1497     if (bDoForces)
1498     {
1499         post_process_forces(cr, step, nrnb, wcycle,
1500                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1501                             vir_force, mdatoms, graph, fr, vsite,
1502                             flags);
1503     }
1504
1505     if (flags & GMX_FORCE_ENERGY)
1506     {
1507         /* Sum the potential energy terms from group contributions */
1508         sum_epot(&(enerd->grpp), enerd->term);
1509
1510         if (!EI_TPI(inputrec->eI))
1511         {
1512             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1513         }
1514     }
1515 }
1516
1517 static void do_force_cutsGROUP(FILE *fplog,
1518                                const t_commrec *cr,
1519                                const gmx_multisim_t *ms,
1520                                const t_inputrec *inputrec,
1521                                gmx::Awh *awh,
1522                                gmx_enfrot *enforcedRotation,
1523                                int64_t step,
1524                                t_nrnb *nrnb,
1525                                gmx_wallcycle_t wcycle,
1526                                gmx_localtop_t *top,
1527                                const gmx_groups_t *groups,
1528                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1529                                history_t *hist,
1530                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1531                                tensor vir_force,
1532                                const t_mdatoms *mdatoms,
1533                                gmx_enerdata_t *enerd,
1534                                t_fcdata *fcd,
1535                                real *lambda,
1536                                t_graph *graph,
1537                                t_forcerec *fr,
1538                                const gmx_vsite_t *vsite,
1539                                rvec mu_tot,
1540                                double t,
1541                                gmx_edsam *ed,
1542                                int flags,
1543                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
1544 {
1545     int        cg0, cg1, i, j;
1546     double     mu[2*DIM];
1547     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1548     gmx_bool   bDoForces;
1549     float      cycles_pme;
1550
1551     const int  start  = 0;
1552     const int  homenr = mdatoms->homenr;
1553
1554     clear_mat(vir_force);
1555
1556     cg0 = 0;
1557     if (DOMAINDECOMP(cr))
1558     {
1559         cg1 = cr->dd->globalAtomGroupIndices.size();
1560     }
1561     else
1562     {
1563         cg1 = top->cgs.nr;
1564     }
1565     if (fr->n_tpi > 0)
1566     {
1567         cg1--;
1568     }
1569
1570     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1571     bNS            = ((flags & GMX_FORCE_NS) != 0);
1572     /* Should we put the charge groups on the neighbor-search grid this step? */
1573     bFillGrid      = (bNS && bStateChanged);
1574     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1575     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1576
1577     if (bStateChanged)
1578     {
1579         update_forcerec(fr, box);
1580
1581         if (inputrecNeedMutot(inputrec))
1582         {
1583             /* Calculate total (local) dipole moment in a temporary common array.
1584              * This makes it possible to sum them over nodes faster.
1585              */
1586             calc_mu(start, homenr,
1587                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1588                     mu, mu+DIM);
1589         }
1590     }
1591
1592     if (fr->ePBC != epbcNONE)
1593     {
1594         /* Compute shift vectors every step,
1595          * because of pressure coupling or box deformation!
1596          */
1597         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1598         {
1599             calc_shifts(box, fr->shift_vec);
1600         }
1601
1602         if (bCalcCGCM)
1603         {
1604             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1605                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1606             inc_nrnb(nrnb, eNR_CGCM, homenr);
1607             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1608         }
1609         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1610         {
1611             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1612         }
1613     }
1614     else if (bCalcCGCM)
1615     {
1616         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1617         inc_nrnb(nrnb, eNR_CGCM, homenr);
1618     }
1619
1620     if (bCalcCGCM && gmx_debug_at)
1621     {
1622         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1623     }
1624
1625 #if GMX_MPI
1626     if (!thisRankHasDuty(cr, DUTY_PME))
1627     {
1628         /* Send particle coordinates to the pme nodes.
1629          * Since this is only implemented for domain decomposition
1630          * and domain decomposition does not use the graph,
1631          * we do not need to worry about shifting.
1632          */
1633         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1634                                  lambda[efptCOUL], lambda[efptVDW],
1635                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1636                                  step, wcycle);
1637     }
1638 #endif /* GMX_MPI */
1639
1640     /* Communicate coordinates and sum dipole if necessary */
1641     if (DOMAINDECOMP(cr))
1642     {
1643         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1644
1645         /* No GPU support, no move_x overlap, so reopen the balance region here */
1646         ddBalanceRegionHandler.reopenRegionCpu();
1647     }
1648
1649     if (inputrecNeedMutot(inputrec))
1650     {
1651         if (bStateChanged)
1652         {
1653             if (PAR(cr))
1654             {
1655                 gmx_sumd(2*DIM, mu, cr);
1656
1657                 ddBalanceRegionHandler.reopenRegionCpu();
1658             }
1659             for (i = 0; i < 2; i++)
1660             {
1661                 for (j = 0; j < DIM; j++)
1662                 {
1663                     fr->mu_tot[i][j] = mu[i*DIM + j];
1664                 }
1665             }
1666         }
1667         if (fr->efep == efepNO)
1668         {
1669             copy_rvec(fr->mu_tot[0], mu_tot);
1670         }
1671         else
1672         {
1673             for (j = 0; j < DIM; j++)
1674             {
1675                 mu_tot[j] =
1676                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1677             }
1678         }
1679     }
1680
1681     /* Reset energies */
1682     reset_enerdata(enerd);
1683     clear_rvecs(SHIFTS, fr->fshift);
1684
1685     if (bNS)
1686     {
1687         wallcycle_start(wcycle, ewcNS);
1688
1689         if (graph && bStateChanged)
1690         {
1691             /* Calculate intramolecular shift vectors to make molecules whole */
1692             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1693         }
1694
1695         /* Do the actual neighbour searching */
1696         ns(fplog, fr, box,
1697            groups, top, mdatoms,
1698            cr, nrnb, bFillGrid);
1699
1700         wallcycle_stop(wcycle, ewcNS);
1701     }
1702
1703     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1704     {
1705         wallcycle_start(wcycle, ewcPPDURINGPME);
1706         dd_force_flop_start(cr->dd, nrnb);
1707     }
1708
1709     if (inputrec->bRot)
1710     {
1711         wallcycle_start(wcycle, ewcROT);
1712         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1713         wallcycle_stop(wcycle, ewcROT);
1714     }
1715
1716     /* Temporary solution until all routines take PaddedRVecVector */
1717     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1718
1719     /* Start the force cycle counter.
1720      * Note that a different counter is used for dynamic load balancing.
1721      */
1722     wallcycle_start(wcycle, ewcFORCE);
1723
1724     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1725     if (bDoForces)
1726     {
1727         /* If we need to compute the virial, we might need a separate
1728          * force buffer for algorithms for which the virial is calculated
1729          * separately, such as PME.
1730          */
1731         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1732         {
1733             forceRef = *fr->forceBufferForDirectVirialContributions;
1734
1735             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1736         }
1737
1738         /* Clear the short- and long-range forces */
1739         clear_rvecs(fr->natoms_force_constr, f);
1740     }
1741
1742     /* forceWithVirial might need the full force atom range */
1743     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1744
1745     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1746     {
1747         clear_pull_forces(inputrec->pull_work);
1748     }
1749
1750     /* update QMMMrec, if necessary */
1751     if (fr->bQMMM)
1752     {
1753         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1754     }
1755
1756     /* Compute the bonded and non-bonded energies and optionally forces */
1757     do_force_lowlevel(fr, inputrec, &(top->idef),
1758                       cr, ms, nrnb, wcycle, mdatoms,
1759                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1760                       box, inputrec->fepvals, lambda,
1761                       graph, &(top->excls), fr->mu_tot,
1762                       flags,
1763                       &cycles_pme, ddBalanceRegionHandler);
1764
1765     wallcycle_stop(wcycle, ewcFORCE);
1766
1767     if (DOMAINDECOMP(cr))
1768     {
1769         dd_force_flop_stop(cr->dd, nrnb);
1770
1771         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1772     }
1773
1774     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1775                          step, t, wcycle,
1776                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1777                          flags, &forceWithVirial, enerd,
1778                          ed, bNS);
1779
1780     if (bDoForces)
1781     {
1782         /* Communicate the forces */
1783         if (DOMAINDECOMP(cr))
1784         {
1785             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1786             /* Do we need to communicate the separate force array
1787              * for terms that do not contribute to the single sum virial?
1788              * Position restraints and electric fields do not introduce
1789              * inter-cg forces, only full electrostatics methods do.
1790              * When we do not calculate the virial, fr->f_novirsum = f,
1791              * so we have already communicated these forces.
1792              */
1793             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1794                 (flags & GMX_FORCE_VIRIAL))
1795             {
1796                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1797             }
1798         }
1799
1800         /* If we have NoVirSum forces, but we do not calculate the virial,
1801          * we sum fr->f_novirsum=f later.
1802          */
1803         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1804         {
1805             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1806                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1807         }
1808
1809         if (flags & GMX_FORCE_VIRIAL)
1810         {
1811             /* Calculation of the virial must be done after vsites! */
1812             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1813                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1814         }
1815     }
1816
1817     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1818     {
1819         /* In case of node-splitting, the PP nodes receive the long-range
1820          * forces, virial and energy from the PME nodes here.
1821          */
1822         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1823     }
1824
1825     if (bDoForces)
1826     {
1827         post_process_forces(cr, step, nrnb, wcycle,
1828                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1829                             vir_force, mdatoms, graph, fr, vsite,
1830                             flags);
1831     }
1832
1833     if (flags & GMX_FORCE_ENERGY)
1834     {
1835         /* Sum the potential energy terms from group contributions */
1836         sum_epot(&(enerd->grpp), enerd->term);
1837
1838         if (!EI_TPI(inputrec->eI))
1839         {
1840             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1841         }
1842     }
1843
1844 }
1845
1846 void do_force(FILE                                     *fplog,
1847               const t_commrec                          *cr,
1848               const gmx_multisim_t                     *ms,
1849               const t_inputrec                         *inputrec,
1850               gmx::Awh                                 *awh,
1851               gmx_enfrot                               *enforcedRotation,
1852               int64_t                                   step,
1853               t_nrnb                                   *nrnb,
1854               gmx_wallcycle_t                           wcycle,
1855               gmx_localtop_t                           *top,
1856               const gmx_groups_t                       *groups,
1857               matrix                                    box,
1858               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
1859               history_t                                *hist,
1860               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
1861               tensor                                    vir_force,
1862               const t_mdatoms                          *mdatoms,
1863               gmx_enerdata_t                           *enerd,
1864               t_fcdata                                 *fcd,
1865               gmx::ArrayRef<real>                       lambda,
1866               t_graph                                  *graph,
1867               t_forcerec                               *fr,
1868               gmx::PpForceWorkload                     *ppForceWorkload,
1869               const gmx_vsite_t                        *vsite,
1870               rvec                                      mu_tot,
1871               double                                    t,
1872               gmx_edsam                                *ed,
1873               int                                       flags,
1874               const DDBalanceRegionHandler             &ddBalanceRegionHandler)
1875 {
1876     /* modify force flag if not doing nonbonded */
1877     if (!fr->bNonbonded)
1878     {
1879         flags &= ~GMX_FORCE_NONBONDED;
1880     }
1881
1882     switch (inputrec->cutoff_scheme)
1883     {
1884         case ecutsVERLET:
1885             do_force_cutsVERLET(fplog, cr, ms, inputrec,
1886                                 awh, enforcedRotation, step, nrnb, wcycle,
1887                                 top,
1888                                 groups,
1889                                 box, x, hist,
1890                                 force, vir_force,
1891                                 mdatoms,
1892                                 enerd, fcd,
1893                                 lambda.data(), graph,
1894                                 fr,
1895                                 ppForceWorkload,
1896                                 fr->ic,
1897                                 vsite, mu_tot,
1898                                 t, ed,
1899                                 flags,
1900                                 ddBalanceRegionHandler);
1901             break;
1902         case ecutsGROUP:
1903             do_force_cutsGROUP(fplog, cr, ms, inputrec,
1904                                awh, enforcedRotation, step, nrnb, wcycle,
1905                                top,
1906                                groups,
1907                                box, x, hist,
1908                                force, vir_force,
1909                                mdatoms,
1910                                enerd, fcd,
1911                                lambda.data(), graph,
1912                                fr, vsite, mu_tot,
1913                                t, ed,
1914                                flags,
1915                                ddBalanceRegionHandler);
1916             break;
1917         default:
1918             gmx_incons("Invalid cut-off scheme passed!");
1919     }
1920
1921     /* In case we don't have constraints and are using GPUs, the next balancing
1922      * region starts here.
1923      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1924      * virial calculation and COM pulling, is thus not included in
1925      * the balance timing, which is ok as most of these tasks involve communication.
1926      */
1927     ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
1928 }
1929
1930
1931 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
1932                         const t_inputrec *ir, const t_mdatoms *md,
1933                         t_state *state)
1934 {
1935     int             i, m, start, end;
1936     int64_t         step;
1937     real            dt = ir->delta_t;
1938     real            dvdl_dum;
1939     rvec           *savex;
1940
1941     /* We need to allocate one element extra, since we might use
1942      * (unaligned) 4-wide SIMD loads to access rvec entries.
1943      */
1944     snew(savex, state->natoms + 1);
1945
1946     start = 0;
1947     end   = md->homenr;
1948
1949     if (debug)
1950     {
1951         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1952                 start, md->homenr, end);
1953     }
1954     /* Do a first constraint pass to reset the particles... */
1955     step = ir->init_step;
1956     if (fplog)
1957     {
1958         char buf[STEPSTRSIZE];
1959         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1960                 gmx_step_str(step, buf));
1961     }
1962     dvdl_dum = 0;
1963
1964     /* constrain the current position */
1965     constr->apply(TRUE, FALSE,
1966                   step, 0, 1.0,
1967                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
1968                   state->box,
1969                   state->lambda[efptBONDED], &dvdl_dum,
1970                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
1971     if (EI_VV(ir->eI))
1972     {
1973         /* constrain the initial velocity, and save it */
1974         /* this may also be useful if we need the ekin from the half step for velocity verlet */
1975         constr->apply(TRUE, FALSE,
1976                       step, 0, 1.0,
1977                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
1978                       state->box,
1979                       state->lambda[efptBONDED], &dvdl_dum,
1980                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
1981     }
1982     /* constrain the initial velocities at t-dt/2 */
1983     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1984     {
1985         auto x = makeArrayRef(state->x).subArray(start, end);
1986         auto v = makeArrayRef(state->v).subArray(start, end);
1987         for (i = start; (i < end); i++)
1988         {
1989             for (m = 0; (m < DIM); m++)
1990             {
1991                 /* Reverse the velocity */
1992                 v[i][m] = -v[i][m];
1993                 /* Store the estimated position at t-dt in savex */
1994                 savex[i][m] = x[i][m] + dt*v[i][m];
1995             }
1996         }
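        /* At this point v holds -v(t) and savex holds x(t) - dt*v(t), i.e. an
         * estimate of the positions at t-dt. */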
1997         /* Shake the positions at t=-dt with the positions at t=0
1998          * as reference coordinates.
1999          */
2000         if (fplog)
2001         {
2002             char buf[STEPSTRSIZE];
2003             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2004                     gmx_step_str(step, buf));
2005         }
2006         dvdl_dum = 0;
2007         constr->apply(TRUE, FALSE,
2008                       step, -1, 1.0,
2009                       state->x.rvec_array(), savex, nullptr,
2010                       state->box,
2011                       state->lambda[efptBONDED], &dvdl_dum,
2012                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2013
2014         for (i = start; i < end; i++)
2015         {
2016             for (m = 0; m < DIM; m++)
2017             {
2018                 /* Re-reverse the velocities */
2019                 v[i][m] = -v[i][m];
2020             }
2021         }
2022     }
2023     sfree(savex);
2024 }
2025
2026
2027 static void
2028 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2029                 double *enerout, double *virout)
2030 {
2031     double enersum, virsum;
2032     double invscale, invscale2, invscale3;
2033     double r, ea, eb, ec, pa, pb, pc, pd;
2034     double y0, f, g, h;
2035     int    ri, offset;
2036     double tabfactor;
2037
2038     invscale  = 1.0/scale;
2039     invscale2 = invscale*invscale;
2040     invscale3 = invscale*invscale2;
2041
2042     /* The following summation is derived from the cubic spline definition,
2043      * Numerical Recipes in C, second edition, p. 113-116.  It is exact for
2044      * the cubic spline.  We first calculate the negative of the
2045      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2046      * add the more standard, abrupt cutoff correction to that result,
2047      * yielding the long-range correction for a switched function.  We
2048      * perform both the pressure and energy loops at the same time for
2049      * simplicity, as the computational cost is low. */
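    /* In each table bin the tabulated potential is the cubic polynomial
     * y(t) = y0 + f*t + g*t^2 + h*t^3 in the fractional coordinate t in [0,1],
     * with r = (ri + t)/scale.  The ea/eb/ec and pa/pb/pc/pd factors below
     * combine into the bin-wise integrals of r^2*t^n and r^3*t^n, so that
     * enersum effectively accumulates the integral of r^2*y(r) dr and virsum
     * that of r^3*dy/dr dr; multiplying by 4*pi and the table scaling factor
     * then yields the energy and virial corrections.
     */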
2050
2051     if (offstart == 0)
2052     {
2053         /* Since the dispersion table has been scaled down by a factor of
2054          * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2055          * c6/c12 parameters inside nbfp[] being scaled up (to save
2056          * flops in kernels), we need to correct for this.
2057          */
2058         tabfactor = 6.0;
2059     }
2060     else
2061     {
2062         tabfactor = 12.0;
2063     }
2064
2065     enersum = 0.0;
2066     virsum  = 0.0;
2067     for (ri = rstart; ri < rend; ++ri)
2068     {
2069         r  = ri*invscale;
2070         ea = invscale3;
2071         eb = 2.0*invscale2*r;
2072         ec = invscale*r*r;
2073
2074         pa = invscale3;
2075         pb = 3.0*invscale2*r;
2076         pc = 3.0*invscale*r*r;
2077         pd = r*r*r;
2078
2079         /* this "8" is from the packing in the vdwtab array - perhaps
2080            should be defined? */
2081
2082         offset = 8*ri + offstart;
2083         y0     = vdwtab[offset];
2084         f      = vdwtab[offset+1];
2085         g      = vdwtab[offset+2];
2086         h      = vdwtab[offset+3];
2087
2088         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2089         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2090     }
2091     *enerout = 4.0*M_PI*enersum*tabfactor;
2092     *virout  = 4.0*M_PI*virsum*tabfactor;
2093 }
2094
2095 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2096 {
2097     double   eners[2], virs[2], enersum, virsum;
2098     double   r0, rc3, rc9;
2099     int      ri0, ri1, i;
2100     real     scale, *vdwtab;
2101
2102     fr->enershiftsix    = 0;
2103     fr->enershifttwelve = 0;
2104     fr->enerdiffsix     = 0;
2105     fr->enerdifftwelve  = 0;
2106     fr->virdiffsix      = 0;
2107     fr->virdifftwelve   = 0;
2108
2109     const interaction_const_t *ic = fr->ic;
2110
2111     if (eDispCorr != edispcNO)
2112     {
2113         for (i = 0; i < 2; i++)
2114         {
2115             eners[i] = 0;
2116             virs[i]  = 0;
2117         }
2118         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2119             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2120             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2121             (ic->vdwtype == evdwSHIFT) ||
2122             (ic->vdwtype == evdwSWITCH))
2123         {
2124             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2125                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2126                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2127             {
2128                 gmx_fatal(FARGS,
2129                           "With dispersion correction rvdw-switch can not be zero "
2130                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2131             }
2132
2133             /* TODO This code depends on the logic in tables.c that
2134                constructs the table layout, which should be made
2135                explicit in future cleanup. */
2136             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2137                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2138             scale  = fr->dispersionCorrectionTable->scale;
2139             vdwtab = fr->dispersionCorrectionTable->data;
2140
2141             /* Round the cut-offs to exact table values for precision */
2142             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2143             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2144
2145             /* The code below has some support for handling force-switching, i.e.
2146              * when the force (instead of potential) is switched over a limited
2147              * region. This leads to a constant shift in the potential inside the
2148              * switching region, which we can handle by adding a constant energy
2149              * term in the force-switch case just like when we do potential-shift.
2150              *
2151              * For now this is not enabled, but to keep the functionality in the
2152              * code we check separately for switch and shift. When we do force-switch
2153              * the shifting point is rvdw_switch, while it is the cutoff when we
2154              * have a classical potential-shift.
2155              *
2156              * For a pure potential-shift the potential has a constant shift
2157              * all the way out to the cutoff, and that is it. For other forms
2158              * we need to calculate the constant shift up to the point where we
2159              * start modifying the potential.
2160              */
2161             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2162
2163             r0   = ri0/scale;
2164             rc3  = r0*r0*r0;
2165             rc9  = rc3*rc3*rc3;
2166
2167             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2168                 (ic->vdwtype == evdwSHIFT))
2169             {
2170                 /* Determine the constant energy shift below rvdw_switch.
2171                  * The table values are scaled down by the derivative factors 6 and 12 (folded
2172                  * into c6/c12 to save flops in analytical kernels), so we scale them back up here.
2173                  */
2174                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2175                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2176             }
2177             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2178             {
2179                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2180                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2181             }
2182
2183             /* Add the constant part from 0 to rvdw_switch.
2184              * This integration from 0 to rvdw_switch overcounts the number
2185              * of interactions by 1, as it also counts the self interaction.
2186              * We will correct for this later.
2187              */
2188             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2189             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2190
2191             /* Calculate the contribution in the range [r0,r1] where we
2192              * modify the potential. For a pure potential-shift modifier we will
2193              * have ri0==ri1, and there will not be any contribution here.
2194              */
2195             for (i = 0; i < 2; i++)
2196             {
2197                 enersum = 0;
2198                 virsum  = 0;
2199                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2200                 eners[i] -= enersum;
2201                 virs[i]  -= virsum;
2202             }
2203
2204             /* Alright: Above we compensated by REMOVING the parts outside r0
2205              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2206              *
2207              * Regardless of whether r0 is the point where we start switching,
2208              * or the cutoff where we calculated the constant shift, we include
2209              * all the parts we are missing out to infinity from r0 by
2210              * calculating the analytical dispersion correction.
2211              */
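            /* For reference, these constants are the tail integrals of the
             * unmodified 1/r^6 and 1/r^12 terms, assuming g(r)=1 beyond r0:
             *   energy:  int_r0^inf 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3)
             *            int_r0^inf 4*pi*r^2*(1/r^12)  dr =  4*pi/(9*r0^9)
             *   virial:  int_r0^inf 4*pi*r^3*dV/dr dr with V = -1/r^6 gives 8*pi/r0^3,
             *            and with V = 1/r^12 gives -16*pi/(3*r0^9).
             */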
2212             eners[0] += -4.0*M_PI/(3.0*rc3);
2213             eners[1] +=  4.0*M_PI/(9.0*rc9);
2214             virs[0]  +=  8.0*M_PI/rc3;
2215             virs[1]  += -16.0*M_PI/(3.0*rc9);
2216         }
2217         else if (ic->vdwtype == evdwCUT ||
2218                  EVDW_PME(ic->vdwtype) ||
2219                  ic->vdwtype == evdwUSER)
2220         {
2221             if (ic->vdwtype == evdwUSER && fplog)
2222             {
2223                 fprintf(fplog,
2224                         "WARNING: using dispersion correction with user tables\n");
2225             }
2226
2227             /* Note that with LJ-PME, the dispersion correction is multiplied
2228              * by the difference between the actual C6 and the value of C6
2229              * that would produce the combination rule.
2230              * This means the normal energy and virial difference formulas
2231              * can be used here.
2232              */
2233
2234             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2235             rc9  = rc3*rc3*rc3;
2236             /* Contribution beyond the cut-off */
2237             eners[0] += -4.0*M_PI/(3.0*rc3);
2238             eners[1] +=  4.0*M_PI/(9.0*rc9);
2239             if (ic->vdw_modifier == eintmodPOTSHIFT)
2240             {
2241                 /* Contribution within the cut-off */
2242                 eners[0] += -4.0*M_PI/(3.0*rc3);
2243                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2244             }
2245             /* Contribution beyond the cut-off */
2246             virs[0]  +=  8.0*M_PI/rc3;
2247             virs[1]  += -16.0*M_PI/(3.0*rc9);
2248         }
2249         else
2250         {
2251             gmx_fatal(FARGS,
2252                       "Dispersion correction is not implemented for vdw-type = %s",
2253                       evdw_names[ic->vdwtype]);
2254         }
2255
2256         /* When we deprecate the group kernels the code below can go too */
2257         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2258         {
2259             /* Calculate self-interaction coefficient (assuming that
2260              * the reciprocal-space contribution is constant in the
2261              * region that contributes to the self-interaction).
2262              */
2263             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2264
2265             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2266             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2267         }
2268
2269         fr->enerdiffsix    = eners[0];
2270         fr->enerdifftwelve = eners[1];
2271         /* The 0.5 is due to the Gromacs definition of the virial */
2272         fr->virdiffsix     = 0.5*virs[0];
2273         fr->virdifftwelve  = 0.5*virs[1];
2274     }
2275 }
2276
2277 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2278                    const matrix box, real lambda, tensor pres, tensor virial,
2279                    real *prescorr, real *enercorr, real *dvdlcorr)
2280 {
2281     gmx_bool bCorrAll, bCorrPres;
2282     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2283     int      m;
2284
2285     *prescorr = 0;
2286     *enercorr = 0;
2287     *dvdlcorr = 0;
2288
2289     clear_mat(virial);
2290     clear_mat(pres);
2291
2292     if (ir->eDispCorr != edispcNO)
2293     {
2294         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2295                      ir->eDispCorr == edispcAllEnerPres);
2296         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2297                      ir->eDispCorr == edispcAllEnerPres);
2298
2299         invvol = 1/det(box);
2300         if (fr->n_tpi)
2301         {
2302             /* Only correct for the interactions with the inserted molecule */
2303             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2304             ninter = fr->n_tpi;
2305         }
2306         else
2307         {
2308             dens   = fr->numAtomsForDispersionCorrection*invvol;
2309             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2310         }
2311
2312         if (ir->efep == efepNO)
2313         {
2314             avcsix    = fr->avcsix[0];
2315             avctwelve = fr->avctwelve[0];
2316         }
2317         else
2318         {
2319             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2320             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2321         }
2322
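        /* For a plain cut-off (enershiftsix = 0, enerdiffsix = -4*pi/(3*rc^3))
         * and ninter = N/2, the two lines below reduce to the standard
         * dispersion tail correction  E_corr = -(2*pi/3)*(N^2/V)*<C6>/rc^3. */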
2323         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2324         *enercorr += avcsix*enerdiff;
2325         dvdlambda  = 0.0;
2326         if (ir->efep != efepNO)
2327         {
2328             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2329         }
2330         if (bCorrAll)
2331         {
2332             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2333             *enercorr += avctwelve*enerdiff;
2334             if (fr->efep != efepNO)
2335             {
2336                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2337             }
2338         }
2339
2340         if (bCorrPres)
2341         {
2342             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2343             if (ir->eDispCorr == edispcAllEnerPres)
2344             {
2345                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2346             }
2347             /* The factor 2 is because of the Gromacs virial definition */
2348             spres = -2.0*invvol*svir*PRESFAC;
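            /* For a plain cut-off the dispersion part of this works out to the
             * standard pressure tail correction
             * P_corr = -(4*pi/3)*(N/V)^2*<C6>/rc^3, with PRESFAC converting
             * the result to bar. */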
2349
2350             for (m = 0; m < DIM; m++)
2351             {
2352                 virial[m][m] += svir;
2353                 pres[m][m]   += spres;
2354             }
2355             *prescorr += spres;
2356         }
2357
2358         /* Can't currently control when it prints; for now, just print when debugging */
2359         if (debug)
2360         {
2361             if (bCorrAll)
2362             {
2363                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2364                         avcsix, avctwelve);
2365             }
2366             if (bCorrPres)
2367             {
2368                 fprintf(debug,
2369                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2370                         *enercorr, spres, svir);
2371             }
2372             else
2373             {
2374                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2375             }
2376         }
2377
2378         if (fr->efep != efepNO)
2379         {
2380             *dvdlcorr += dvdlambda;
2381         }
2382     }
2383 }
2384
2385 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2386                             const gmx_mtop_t *mtop, rvec x[],
2387                             gmx_bool bFirst)
2388 {
2389     t_graph        *graph;
2390     int             as, mol;
2391
2392     if (bFirst && fplog)
2393     {
2394         fprintf(fplog, "Removing pbc first time\n");
2395     }
2396
2397     snew(graph, 1);
2398     as = 0;
2399     for (const gmx_molblock_t &molb : mtop->molblock)
2400     {
2401         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2402         if (moltype.atoms.nr == 1 ||
2403             (!bFirst && moltype.cgs.nr == 1))
2404         {
2405             /* Just one atom or charge group in the molecule, no PBC required */
2406             as += molb.nmol*moltype.atoms.nr;
2407         }
2408         else
2409         {
2410             mk_graph_moltype(moltype, graph);
2411
2412             for (mol = 0; mol < molb.nmol; mol++)
2413             {
2414                 mk_mshift(fplog, graph, ePBC, box, x+as);
2415
2416                 shift_self(graph, box, x+as);
2417                 /* The molecule is whole now.
2418                  * We don't need the second mk_mshift call as in do_pbc_first,
2419                  * since we no longer need this graph.
2420                  */
2421
2422                 as += moltype.atoms.nr;
2423             }
2424             done_graph(graph);
2425         }
2426     }
2427     sfree(graph);
2428 }
2429
2430 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2431                        const gmx_mtop_t *mtop, rvec x[])
2432 {
2433     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2434 }
2435
2436 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2437                  const gmx_mtop_t *mtop, rvec x[])
2438 {
2439     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2440 }
2441
2442 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2443 {
2444     int t, nth;
2445     nth = gmx_omp_nthreads_get(emntDefault);
2446
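    /* Split the atom range into one contiguous, near-equal chunk per thread;
     * the offset/len arithmetic below tiles the whole range without gaps or
     * overlap. */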
2447 #pragma omp parallel for num_threads(nth) schedule(static)
2448     for (t = 0; t < nth; t++)
2449     {
2450         try
2451         {
2452             size_t natoms = x.size();
2453             size_t offset = (natoms*t    )/nth;
2454             size_t len    = (natoms*(t + 1))/nth - offset;
2455             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2456         }
2457         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2458     }
2459 }
2460
2461 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2462 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2463                 const t_inputrec *inputrec,
2464                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2465                 gmx_walltime_accounting_t walltime_accounting,
2466                 nonbonded_verlet_t *nbv,
2467                 const gmx_pme_t *pme,
2468                 gmx_bool bWriteStat)
2469 {
2470     t_nrnb *nrnb_tot = nullptr;
2471     double  delta_t  = 0;
2472     double  nbfs     = 0, mflop = 0;
2473     double  elapsed_time,
2474             elapsed_time_over_all_ranks,
2475             elapsed_time_over_all_threads,
2476             elapsed_time_over_all_threads_over_all_ranks;
2477     /* Control whether it is valid to print a report. Only the
2478        simulation master may print, but it should not do so if the run
2479        terminated e.g. before a scheduled reset step. This is
2480        complicated by the fact that PME ranks are unaware of the
2481        reason why they were sent a pmerecvqxFINISH. To avoid
2482        communication deadlocks, we always do the communication for the
2483        report, even if we've decided not to write the report, because
2484        how long it takes to finish the run is not important when we've
2485        decided not to report on the simulation performance.
2486
2487        Further, we only report performance for dynamical integrators,
2488        because those are the only ones for which we plan to
2489        consider doing any optimizations. */
2490     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2491
2492     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2493     {
2494         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2495         printReport = false;
2496     }
2497
2498     if (cr->nnodes > 1)
2499     {
2500         snew(nrnb_tot, 1);
2501 #if GMX_MPI
2502         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2503                       cr->mpi_comm_mysim);
2504 #endif
2505     }
2506     else
2507     {
2508         nrnb_tot = nrnb;
2509     }
2510
2511     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2512     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2513     if (cr->nnodes > 1)
2514     {
2515 #if GMX_MPI
2516         /* reduce elapsed_time over all MPI ranks in the current simulation */
2517         MPI_Allreduce(&elapsed_time,
2518                       &elapsed_time_over_all_ranks,
2519                       1, MPI_DOUBLE, MPI_SUM,
2520                       cr->mpi_comm_mysim);
2521         elapsed_time_over_all_ranks /= cr->nnodes;
2522         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2523          * current simulation. */
2524         MPI_Allreduce(&elapsed_time_over_all_threads,
2525                       &elapsed_time_over_all_threads_over_all_ranks,
2526                       1, MPI_DOUBLE, MPI_SUM,
2527                       cr->mpi_comm_mysim);
2528 #endif
2529     }
2530     else
2531     {
2532         elapsed_time_over_all_ranks                  = elapsed_time;
2533         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2534     }
2535
2536     if (printReport)
2537     {
2538         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2539     }
2540     if (cr->nnodes > 1)
2541     {
2542         sfree(nrnb_tot);
2543     }
2544
2545     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2546     {
2547         print_dd_statistics(cr, inputrec, fplog);
2548     }
2549
2550     /* TODO Move the responsibility for any scaling by thread counts
2551      * to the code that handled the thread region, so that there's a
2552      * mechanism to keep cycle counting working during the transition
2553      * to task parallelism. */
2554     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2555     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2556     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2557     auto cycle_sum(wallcycle_sum(cr, wcycle));
2558
2559     if (printReport)
2560     {
2561         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2562         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2563         if (pme_gpu_task_enabled(pme))
2564         {
2565             pme_gpu_get_timings(pme, &pme_gpu_timings);
2566         }
2567         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2568                         elapsed_time_over_all_ranks,
2569                         wcycle, cycle_sum,
2570                         nbnxn_gpu_timings,
2571                         &pme_gpu_timings);
2572
2573         if (EI_DYNAMICS(inputrec->eI))
2574         {
2575             delta_t = inputrec->delta_t;
2576         }
2577
2578         if (fplog)
2579         {
2580             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2581                        elapsed_time_over_all_ranks,
2582                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2583                        delta_t, nbfs, mflop);
2584         }
2585         if (bWriteStat)
2586         {
2587             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2588                        elapsed_time_over_all_ranks,
2589                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2590                        delta_t, nbfs, mflop);
2591         }
2592     }
2593 }
2594
2595 void initialize_lambdas(FILE               *fplog,
2596                         const t_inputrec   &ir,
2597                         bool                isMaster,
2598                         int                *fep_state,
2599                         gmx::ArrayRef<real> lambda,
2600                         double             *lam0)
2601 {
2602     /* TODO: Clean up initialization of fep_state and lambda in
2603        t_state.  This function works, but could probably use a logic
2604        rewrite to keep all the different types of efep straight. */
2605
2606     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2607     {
2608         return;
2609     }
2610
2611     const t_lambda *fep = ir.fepvals;
2612     if (isMaster)
2613     {
2614         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2615                                              a checkpoint -- a kludge is in place for now
2616                                              to prevent this. */
2617     }
2618
2619     for (int i = 0; i < efptNR; i++)
2620     {
2621         double thisLambda;
2622         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2623         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2624         {
2625             thisLambda = fep->init_lambda;
2626         }
2627         else
2628         {
2629             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2630         }
2631         if (isMaster)
2632         {
2633             lambda[i] = thisLambda;
2634         }
2635         if (lam0 != nullptr)
2636         {
2637             lam0[i] = thisLambda;
2638         }
2639     }
2640     if (ir.bSimTemp)
2641     {
2642         /* need to rescale control temperatures to match current state */
2643         for (int i = 0; i < ir.opts.ngtc; i++)
2644         {
2645             if (ir.opts.ref_t[i] > 0)
2646             {
2647                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2648             }
2649         }
2650     }
2651
2652     /* Write the initial lambda components to the log */
2653     if (fplog != nullptr)
2654     {
2655         fprintf(fplog, "Initial vector of lambda components:[ ");
2656         for (const auto &l : lambda)
2657         {
2658             fprintf(fplog, "%10.4f ", l);
2659         }
2660         fprintf(fplog, "]\n");
2661     }
2662 }