Simplify make_pairlist() call signature
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
117
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
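// Usage sketch (an illustration, not prescribed by this file): since only
// getenv() != nullptr is tested, merely defining the variable is enough, e.g.
//   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun ...
// which makes the rank fall back to the static wait order instead of the
// polling-based alternation implemented in alternatePmeNbGpuWaitReduce() below.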
123
124
125 void print_time(FILE                     *out,
126                 gmx_walltime_accounting_t walltime_accounting,
127                 int64_t                   step,
128                 const t_inputrec         *ir,
129                 const t_commrec          *cr)
130 {
131     time_t finish;
132     double dt, elapsed_seconds, time_per_step;
133
134 #if !GMX_THREAD_MPI
135     if (!PAR(cr))
136 #endif
137     {
138         fprintf(out, "\r");
139     }
140     fputs("step ", out);
141     fputs(gmx::int64ToString(step).c_str(), out);
142     fflush(out);
143
144     if ((step >= ir->nstlist))
145     {
146         double seconds_since_epoch = gmx_gettime();
147         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
149         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
150
151         if (ir->nsteps >= 0)
152         {
153             if (dt >= 300)
154             {
155                 finish = static_cast<time_t>(seconds_since_epoch + dt);
156                 auto timebuf = gmx_ctime_r(&finish);
157                 timebuf.erase(timebuf.find_first_of('\n'));
158                 fputs(", will finish ", out);
159                 fputs(timebuf.c_str(), out);
160             }
161             else
162             {
163                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
164             }
165         }
166         else
167         {
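                /* delta_t is in ps, so delta_t/1000 is ns per step; multiplying by
                 * 24*60*60 s/day and dividing by the measured wall-clock seconds
                 * per step gives ns/day.
                 */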
168             fprintf(out, " performance: %.1f ns/day    ",
169                     ir->delta_t/1000*24*60*60/time_per_step);
170         }
171     }
172 #if !GMX_THREAD_MPI
173     if (PAR(cr))
174     {
175         fprintf(out, "\n");
176     }
177 #else
178     GMX_UNUSED_VALUE(cr);
179 #endif
180
181     fflush(out);
182 }
183
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185                          double the_time)
186 {
187     if (!fplog)
188     {
189         return;
190     }
191
192     time_t temp_time = static_cast<time_t>(the_time);
193
194     auto   timebuf = gmx_ctime_r(&temp_time);
195
196     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
197 }
198
199 void print_start(FILE *fplog, const t_commrec *cr,
200                  gmx_walltime_accounting_t walltime_accounting,
201                  const char *name)
202 {
203     char buf[STRLEN];
204
205     sprintf(buf, "Started %s", name);
206     print_date_and_time(fplog, cr->nodeid, buf,
207                         walltime_accounting_get_start_time_stamp(walltime_accounting));
208 }
209
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
211 {
212     const int      end = forceToAdd.size();
213
214     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
215 #pragma omp parallel for num_threads(nt) schedule(static)
216     for (int i = 0; i < end; i++)
217     {
218         rvec_inc(f[i], forceToAdd[i]);
219     }
220 }
221
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223                         tensor vir_part, const t_graph *graph, const matrix box,
224                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
225 {
226     /* The short-range virial from surrounding boxes */
227     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
229
230     /* Calculate partial virial, for local atoms only, based on short range.
231      * Total virial is computed in global_stat, called from do_md
232      */
233     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
235
236     if (debug)
237     {
238         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
239     }
240 }
241
242 static void pull_potential_wrapper(const t_commrec *cr,
243                                    const t_inputrec *ir,
244                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245                                    gmx::ForceWithVirial *force,
246                                    const t_mdatoms *mdatoms,
247                                    gmx_enerdata_t *enerd,
248                                    const real *lambda,
249                                    double t,
250                                    gmx_wallcycle_t wcycle)
251 {
252     t_pbc  pbc;
253     real   dvdl;
254
255     /* Calculate the center-of-mass forces; this requires communication,
256      * which is why pull_potential is called close to other communication.
257      */
258     wallcycle_start(wcycle, ewcPULLPOT);
259     set_pbc(&pbc, ir->ePBC, box);
260     dvdl                     = 0;
261     enerd->term[F_COM_PULL] +=
262         pull_potential(ir->pull_work, mdatoms, &pbc,
263                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265     wallcycle_stop(wcycle, ewcPULLPOT);
266 }
267
268 static void pme_receive_force_ener(const t_commrec      *cr,
269                                    gmx::ForceWithVirial *forceWithVirial,
270                                    gmx_enerdata_t       *enerd,
271                                    gmx_wallcycle_t       wcycle)
272 {
273     real   e_q, e_lj, dvdl_q, dvdl_lj;
274     float  cycles_ppdpme, cycles_seppme;
275
276     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
278
279     /* In case of node-splitting, the PP nodes receive the long-range
280      * forces, virial and energy from the PME nodes here.
281      */
282     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283     dvdl_q  = 0;
284     dvdl_lj = 0;
285     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286                       &cycles_seppme);
287     enerd->term[F_COUL_RECIP] += e_q;
288     enerd->term[F_LJ_RECIP]   += e_lj;
289     enerd->dvdl_lin[efptCOUL] += dvdl_q;
290     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
291
292     if (wcycle)
293     {
294         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
295     }
296     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
297 }
298
299 static void print_large_forces(FILE            *fp,
300                                const t_mdatoms *md,
301                                const t_commrec *cr,
302                                int64_t          step,
303                                real             forceTolerance,
304                                const rvec      *x,
305                                const rvec      *f)
306 {
307     real           force2Tolerance = gmx::square(forceTolerance);
308     gmx::index     numNonFinite    = 0;
309     for (int i = 0; i < md->homenr; i++)
310     {
311         real force2    = norm2(f[i]);
312         bool nonFinite = !std::isfinite(force2);
313         if (force2 >= force2Tolerance || nonFinite)
314         {
315             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
316                     step,
317                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
318         }
319         if (nonFinite)
320         {
321             numNonFinite++;
322         }
323     }
324     if (numNonFinite > 0)
325     {
326         /* Note that with MPI this fatal call on one rank might interrupt
327          * the printing on other ranks. But we can only avoid that with
328          * an expensive MPI barrier that we would need at each step.
329          */
330         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
331     }
332 }
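/* Note on the threshold: forceTolerance is fr->print_force at the call site below;
 * as far as we understand it is set from the mdrun -pforce option (in kJ mol^-1 nm^-1)
 * and is negative when force printing is disabled, hence the fr->print_force >= 0
 * check in post_process_forces().
 */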
333
334 static void post_process_forces(const t_commrec           *cr,
335                                 int64_t                    step,
336                                 t_nrnb                    *nrnb,
337                                 gmx_wallcycle_t            wcycle,
338                                 const gmx_localtop_t      *top,
339                                 const matrix               box,
340                                 const rvec                 x[],
341                                 rvec                       f[],
342                                 gmx::ForceWithVirial      *forceWithVirial,
343                                 tensor                     vir_force,
344                                 const t_mdatoms           *mdatoms,
345                                 const t_graph             *graph,
346                                 const t_forcerec          *fr,
347                                 const gmx_vsite_t         *vsite,
348                                 int                        flags)
349 {
350     if (fr->haveDirectVirialContributions)
351     {
352         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
353
354         if (vsite)
355         {
356             /* Spread the mesh force on virtual sites to the other particles...
357              * This is parallelized. MPI communication is performed
358              * if the constructing atoms aren't local.
359              */
360             matrix virial = { { 0 } };
361             spread_vsite_f(vsite, x, fDirectVir, nullptr,
362                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
363                            nrnb,
364                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365             forceWithVirial->addVirialContribution(virial);
366         }
367
368         if (flags & GMX_FORCE_VIRIAL)
369         {
370             /* Now add the forces, this is local */
371             sum_forces(f, forceWithVirial->force_);
372
373             /* Add the direct virial contributions */
374             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
376
377             if (debug)
378             {
379                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
380             }
381         }
382     }
383
384     if (fr->print_force >= 0)
385     {
386         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
387     }
388 }
389
390 static void do_nb_verlet(t_forcerec                       *fr,
391                          const interaction_const_t        *ic,
392                          gmx_enerdata_t                   *enerd,
393                          const int                         flags,
394                          const Nbnxm::InteractionLocality  ilocality,
395                          const int                         clearF,
396                          const int64_t                     step,
397                          t_nrnb                           *nrnb,
398                          gmx_wallcycle_t                   wcycle)
399 {
400     if (!(flags & GMX_FORCE_NONBONDED))
401     {
402         /* skip non-bonded calculation */
403         return;
404     }
405
406     nonbonded_verlet_t *nbv  = fr->nbv;
407
408     /* GPU kernel launch overhead is already timed separately */
409     if (fr->cutoff_scheme != ecutsVERLET)
410     {
411         gmx_incons("Invalid cut-off scheme passed!");
412     }
413
414     if (!nbv->useGpu())
415     {
416         /* When dynamic pair-list pruning is requested, we need to prune
417          * at nstlistPrune steps.
418          */
419         if (nbv->listParams->useDynamicPruning &&
420             nbnxnIsDynamicPairlistPruningStep(*nbv, ilocality, step))
421         {
422             /* Prune the pair-list beyond fr->ic->rlistPrune using
423              * the current coordinates of the atoms.
424              */
425             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
426             NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
427             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
428         }
429
430         wallcycle_sub_start(wcycle, ewcsNONBONDED);
431     }
432
433     NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
434
435     if (!nbv->useGpu())
436     {
437         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
438     }
439 }
440
441 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
442                              t_forcerec           *fr,
443                              rvec                  x[],
444                              rvec                  f[],
445                              const t_mdatoms      *mdatoms,
446                              t_lambda             *fepvals,
447                              real                 *lambda,
448                              gmx_enerdata_t       *enerd,
449                              int                   flags,
450                              t_nrnb               *nrnb,
451                              gmx_wallcycle_t       wcycle)
452 {
453     int              donb_flags;
454     nb_kernel_data_t kernel_data;
455     real             lam_i[efptNR];
456     real             dvdl_nb[efptNR];
457     int              th;
458     int              i, j;
459
460     donb_flags = 0;
461     /* Add short-range interactions */
462     donb_flags |= GMX_NONBONDED_DO_SR;
463
464     /* Currently all group scheme kernels always calculate (shift-)forces */
465     if (flags & GMX_FORCE_FORCES)
466     {
467         donb_flags |= GMX_NONBONDED_DO_FORCE;
468     }
469     if (flags & GMX_FORCE_VIRIAL)
470     {
471         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
472     }
473     if (flags & GMX_FORCE_ENERGY)
474     {
475         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
476     }
477
478     kernel_data.flags  = donb_flags;
479     kernel_data.lambda = lambda;
480     kernel_data.dvdl   = dvdl_nb;
481
482     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
483     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
484
485     /* reset free energy components */
486     for (i = 0; i < efptNR; i++)
487     {
488         dvdl_nb[i]  = 0;
489     }
490
491     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
492
493     wallcycle_sub_start(wcycle, ewcsNONBONDED);
494 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
495     for (th = 0; th < nbl_lists->nnbl; th++)
496     {
497         try
498         {
499             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
500                                       x, f, fr, mdatoms, &kernel_data, nrnb);
501         }
502         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
503     }
504
505     if (fepvals->sc_alpha != 0)
506     {
507         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
508         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
509     }
510     else
511     {
512         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
513         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
514     }
515
516     /* If we do foreign lambda and we have soft-core interactions
517      * we have to recalculate the (non-linear) energy contributions.
518      */
519     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
520     {
521         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
522         kernel_data.lambda         = lam_i;
523         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
524         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
525         /* Note that we add to kernel_data.dvdl, but ignore the result */
526
527         for (i = 0; i < enerd->n_lambda; i++)
528         {
529             for (j = 0; j < efptNR; j++)
530             {
531                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
532             }
533             reset_foreign_enerdata(enerd);
534 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
535             for (th = 0; th < nbl_lists->nnbl; th++)
536             {
537                 try
538                 {
539                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
540                                               x, f, fr, mdatoms, &kernel_data, nrnb);
541                 }
542                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
543             }
544
545             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
546             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
547         }
548     }
549
550     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
551 }
552
553 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
554 {
555     return nbv != nullptr && nbv->useGpu();
556 }
557
558 static inline void clear_rvecs_omp(int n, rvec v[])
559 {
560     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
561
562     /* Note that we would like to avoid this conditional by putting it
563      * into the omp pragma instead, but then we still take the full
564      * omp parallel for overhead (at least with gcc5).
565      */
566     if (nth == 1)
567     {
568         for (int i = 0; i < n; i++)
569         {
570             clear_rvec(v[i]);
571         }
572     }
573     else
574     {
575 #pragma omp parallel for num_threads(nth) schedule(static)
576         for (int i = 0; i < n; i++)
577         {
578             clear_rvec(v[i]);
579         }
580     }
581 }
582
583 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
584  *
585  * \param groupOptions  Group options, containing T-coupling options
586  */
587 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
588 {
589     real nrdfCoupled   = 0;
590     real nrdfUncoupled = 0;
591     real kineticEnergy = 0;
592     for (int g = 0; g < groupOptions.ngtc; g++)
593     {
594         if (groupOptions.tau_t[g] >= 0)
595         {
596             nrdfCoupled   += groupOptions.nrdf[g];
597             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
598         }
599         else
600         {
601             nrdfUncoupled += groupOptions.nrdf[g];
602         }
603     }
604
605     /* This conditional with > also catches nrdf=0 */
606     if (nrdfCoupled > nrdfUncoupled)
607     {
608         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
609     }
610     else
611     {
612         return 0;
613     }
614 }
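/* Worked example for the estimate above (illustrative numbers only): one T-coupled
 * group with nrdf = 3e5 and ref_t = 300 K gives
 *   kineticEnergy = 3e5 * 0.5 * 300 * BOLTZ ~ 3e5 * 1.25 ~ 3.7e5 kJ/mol,
 * and with no uncoupled degrees of freedom that value is returned as is; uncoupled
 * groups only scale it by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled.
 */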
615
616 /*! \brief This routine checks that the potential energy is finite.
617  *
618  * Always checks that the potential energy is finite. If step equals
619  * inputrec.init_step also checks that the magnitude of the potential energy
620  * is reasonable. Terminates with a fatal error when a check fails.
621  * Note that passing this check does not guarantee finite forces,
622  * since those use slightly different arithmetic. But in most cases
623  * there is just a narrow coordinate range where forces are not finite
624  * and energies are finite.
625  *
626  * \param[in] step      The step number, used for checking and printing
627  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
628  * \param[in] inputrec  The input record
629  */
630 static void checkPotentialEnergyValidity(int64_t               step,
631                                          const gmx_enerdata_t &enerd,
632                                          const t_inputrec     &inputrec)
633 {
634     /* Threshold valid for comparing absolute potential energy against
635      * the kinetic energy. Normally one should not consider absolute
636      * potential energy values, but with a factor of one million
637      * we should never get false positives.
638      */
639     constexpr real c_thresholdFactor = 1e6;
640
641     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
642     real           averageKineticEnergy = 0;
643     /* We only check for large potential energy at the initial step,
644      * because that is by far the most likely step for this to occur
645      * and because computing the average kinetic energy is not free.
646      * Note: nstcalcenergy >> 1 often does not allow catching large energies
647      * before they become NaN.
648      */
649     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
650     {
651         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
652     }
653
654     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
655                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
656     {
657         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
658                   step,
659                   enerd.term[F_EPOT],
660                   energyIsNotFinite ? "not finite" : "extremely high",
661                   enerd.term[F_LJ],
662                   enerd.term[F_COUL_SR],
663                   energyIsNotFinite ? "non-finite" : "very high",
664                   energyIsNotFinite ? " or Nan" : "");
665     }
666 }
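/* Rough scale of the check above (illustrative numbers only): for ~1e5 coupled degrees
 * of freedom at 300 K the kinetic-energy estimate is about 1e5 * 0.5 * BOLTZ * 300
 * ~ 1.2e5 kJ/mol, so with c_thresholdFactor = 1e6 the initial-step check only fires
 * for a potential energy above roughly 1e11 kJ/mol; a non-finite potential energy
 * triggers the fatal error whenever this function is called.
 */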
667
668 /*! \brief Compute forces and/or energies for special algorithms
669  *
670  * The intention is to collect all calls to algorithms that compute
671  * forces on local atoms only and that do not contribute to the local
672  * virial sum (but add their virial contribution separately).
673  * Eventually these should likely all become ForceProviders.
674  * Within this function the intention is to have algorithms that do
675  * global communication at the end, so global barriers within the MD loop
676  * are as close together as possible.
677  *
678  * \param[in]     fplog            The log file
679  * \param[in]     cr               The communication record
680  * \param[in]     inputrec         The input record
681  * \param[in]     awh              The Awh module (nullptr if none in use).
682  * \param[in]     enforcedRotation Enforced rotation module.
683  * \param[in]     step             The current MD step
684  * \param[in]     t                The current time
685  * \param[in,out] wcycle           Wallcycle accounting struct
686  * \param[in,out] forceProviders   Pointer to a list of force providers
687  * \param[in]     box              The unit cell
688  * \param[in]     x                The coordinates
689  * \param[in]     mdatoms          Per atom properties
690  * \param[in]     lambda           Array of free-energy lambda values
691  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
692  * \param[in,out] forceWithVirial  Force and virial buffers
693  * \param[in,out] enerd            Energy buffer
694  * \param[in,out] ed               Essential dynamics pointer
695  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
696  *
697  * \todo Remove bNS, which is used incorrectly.
698  * \todo Convert all other algorithms called here to ForceProviders.
699  */
700 static void
701 computeSpecialForces(FILE                          *fplog,
702                      const t_commrec               *cr,
703                      const t_inputrec              *inputrec,
704                      gmx::Awh                      *awh,
705                      gmx_enfrot                    *enforcedRotation,
706                      int64_t                        step,
707                      double                         t,
708                      gmx_wallcycle_t                wcycle,
709                      ForceProviders                *forceProviders,
710                      matrix                         box,
711                      gmx::ArrayRef<const gmx::RVec> x,
712                      const t_mdatoms               *mdatoms,
713                      real                          *lambda,
714                      int                            forceFlags,
715                      gmx::ForceWithVirial          *forceWithVirial,
716                      gmx_enerdata_t                *enerd,
717                      gmx_edsam                     *ed,
718                      gmx_bool                       bNS)
719 {
720     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
721
722     /* NOTE: Currently all ForceProviders only provide forces.
723      *       When they also provide energies, remove this conditional.
724      */
725     if (computeForces)
726     {
727         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
728         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
729
730         /* Collect forces from modules */
731         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
732     }
733
734     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
735     {
736         pull_potential_wrapper(cr, inputrec, box, x,
737                                forceWithVirial,
738                                mdatoms, enerd, lambda, t,
739                                wcycle);
740
741         if (awh)
742         {
743             enerd->term[F_COM_PULL] +=
744                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
745                                                   forceWithVirial,
746                                                   t, step, wcycle, fplog);
747         }
748     }
749
750     rvec *f = as_rvec_array(forceWithVirial->force_.data());
751
752     /* Add the forces from enforced rotation potentials (if any) */
753     if (inputrec->bRot)
754     {
755         wallcycle_start(wcycle, ewcROTadd);
756         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
757         wallcycle_stop(wcycle, ewcROTadd);
758     }
759
760     if (ed)
761     {
762         /* Note that since init_edsam() is called after the initialization
763          * of forcerec, edsam doesn't request the noVirSum force buffer.
764          * Thus if no other algorithm (e.g. PME) requires it, the forces
765          * here will contribute to the virial.
766          */
767         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
768     }
769
770     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
771     if (inputrec->bIMD && computeForces)
772     {
773         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
774     }
775 }
776
777 /*! \brief Launch the prepare_step and spread stages of PME GPU.
778  *
779  * \param[in]  pmedata       The PME structure
780  * \param[in]  box           The box matrix
781  * \param[in]  x             Coordinate array
782  * \param[in]  flags         Force flags
783  * \param[in]  pmeFlags      PME flags
784  * \param[in]  wcycle        The wallcycle structure
785  */
786 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
787                                       matrix          box,
788                                       rvec            x[],
789                                       int             flags,
790                                       int             pmeFlags,
791                                       gmx_wallcycle_t wcycle)
792 {
793     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
794     pme_gpu_launch_spread(pmedata, x, wcycle);
795 }
796
797 /*! \brief Launch the FFT and gather stages of PME GPU
798  *
799  * This function only implements setting the output forces (no accumulation).
800  *
801  * \param[in]  pmedata        The PME structure
802  * \param[in]  wcycle         The wallcycle structure
803  */
804 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
805                                      gmx_wallcycle_t   wcycle)
806 {
807     pme_gpu_launch_complex_transforms(pmedata, wcycle);
808     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
809 }
810
811 /*! \brief
812  *  Polling wait for either of the PME or nonbonded GPU tasks.
813  *
814  * Instead of a static order in waiting for GPU tasks, this function
816  * polls to check which of the two tasks completes first, and does the
816  * associated force buffer reduction overlapped with the other task.
817  * By doing that, unlike static scheduling order, it can always overlap
818  * one of the reductions, regardless of the GPU task completion order.
819  *
820  * \param[in]     nbv              Nonbonded verlet structure
821  * \param[in,out] pmedata          PME module data
822  * \param[in,out] force            Force array to reduce task outputs into.
823  * \param[in,out] forceWithVirial  Force and virial buffers
824  * \param[in,out] fshift           Shift force output vector results are reduced into
825  * \param[in,out] enerd            Energy data structure results are reduced into
826  * \param[in]     flags            Force flags
827  * \param[in]     pmeFlags         PME flags
828  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
829  * \param[in]     wcycle           The wallcycle structure
830  */
831 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
832                                         gmx_pme_t                           *pmedata,
833                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
834                                         gmx::ForceWithVirial                *forceWithVirial,
835                                         rvec                                 fshift[],
836                                         gmx_enerdata_t                      *enerd,
837                                         int                                  flags,
838                                         int                                  pmeFlags,
839                                         bool                                 haveOtherWork,
840                                         gmx_wallcycle_t                      wcycle)
841 {
842     bool isPmeGpuDone = false;
843     bool isNbGpuDone  = false;
844
845
846     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
847
848     while (!isPmeGpuDone || !isNbGpuDone)
849     {
850         if (!isPmeGpuDone)
851         {
852             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
853             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
854         }
855
856         if (!isNbGpuDone)
857         {
858             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
859             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
860             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
861                                                      flags,
862                                                      Nbnxm::AtomLocality::Local,
863                                                      haveOtherWork,
864                                                      enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
865                                                      fshift, completionType);
866             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
867             // To get the call count right, when the task finished we
868             // issue a start/stop.
869             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
870             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
871             if (isNbGpuDone)
872             {
873                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
874                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
875
876                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
877                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
878             }
879         }
880     }
881 }
882
883 /*! \brief
884  *  Launch the dynamic rolling pruning GPU task.
885  *
886  *  We currently alternate local/non-local list pruning in odd-even steps
887  *  (only pruning every second step without DD).
888  *
889  * \param[in]     cr               The communication record
890  * \param[in]     nbv              Nonbonded verlet structure
891  * \param[in]     inputrec         The input record
892  * \param[in]     step             The current MD step
893  */
894 static inline void launchGpuRollingPruning(const t_commrec          *cr,
895                                            const nonbonded_verlet_t *nbv,
896                                            const t_inputrec         *inputrec,
897                                            const int64_t             step)
898 {
899     /* We should not launch the rolling pruning kernel at a search
900      * step or just before search steps, since that's useless.
901      * Without domain decomposition we prune at even steps.
902      * With domain decomposition we alternate local and non-local
903      * pruning at even and odd steps.
904      */
905     int  numRollingParts     = nbv->listParams->numRollingParts;
906     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
907     int  stepWithCurrentList = nbnxnNumStepsWithPairlist(*nbv, Nbnxm::InteractionLocality::Local, step);
908     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
909     if (stepWithCurrentList > 0 &&
910         stepWithCurrentList < inputrec->nstlist - 1 &&
911         (stepIsEven || havePPDomainDecomposition(cr)))
912     {
913         Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
914                                            stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
915                                            numRollingParts);
916     }
917 }
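/* Example schedule (assuming nstlist = 10): offsets 1..8 after the last search step are
 * eligible for rolling pruning. With domain decomposition, even offsets prune the local
 * list and odd offsets the non-local list; without domain decomposition only the even
 * offsets launch a pruning kernel (on the local list).
 */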
918
919 static void do_force_cutsVERLET(FILE *fplog,
920                                 const t_commrec *cr,
921                                 const gmx_multisim_t *ms,
922                                 const t_inputrec *inputrec,
923                                 gmx::Awh *awh,
924                                 gmx_enfrot *enforcedRotation,
925                                 int64_t step,
926                                 t_nrnb *nrnb,
927                                 gmx_wallcycle_t wcycle,
928                                 const gmx_localtop_t *top,
929                                 const gmx_groups_t * /* groups */,
930                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
931                                 history_t *hist,
932                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
933                                 tensor vir_force,
934                                 const t_mdatoms *mdatoms,
935                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
936                                 real *lambda,
937                                 t_graph *graph,
938                                 t_forcerec *fr,
939                                 gmx::PpForceWorkload *ppForceWorkload,
940                                 interaction_const_t *ic,
941                                 const gmx_vsite_t *vsite,
942                                 rvec mu_tot,
943                                 double t,
944                                 gmx_edsam *ed,
945                                 const int flags,
946                                 const DDBalanceRegionHandler &ddBalanceRegionHandler)
947 {
948     int                 cg1, i, j;
949     double              mu[2*DIM];
950     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
951     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
952     rvec                vzero, box_diag;
953     float               cycles_pme, cycles_wait_gpu;
954     nonbonded_verlet_t *nbv = fr->nbv;
955
956     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
957     bNS           = ((flags & GMX_FORCE_NS) != 0);
958     bFillGrid     = (bNS && bStateChanged);
959     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
960     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
961     bUseGPU       = fr->nbv->useGpu();
962     bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();
963
964     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
965     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
966     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
967         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
968     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
969         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
970         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
971         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
972
973     /* At a search step we need to start the first balancing region
974      * somewhere early inside the step after communication during domain
975      * decomposition (and not during the previous step as usual).
976      */
977     if (bNS)
978     {
979         ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
980     }
981
982     cycles_wait_gpu = 0;
983
984     const int start  = 0;
985     const int homenr = mdatoms->homenr;
986
987     clear_mat(vir_force);
988
989     if (DOMAINDECOMP(cr))
990     {
991         cg1 = cr->dd->globalAtomGroupIndices.size();
992     }
993     else
994     {
995         cg1 = top->cgs.nr;
996     }
997     if (fr->n_tpi > 0)
998     {
999         cg1--;
1000     }
1001
1002     if (bStateChanged)
1003     {
1004         update_forcerec(fr, box);
1005
1006         if (inputrecNeedMutot(inputrec))
1007         {
1008             /* Calculate total (local) dipole moment in a temporary common array.
1009              * This makes it possible to sum them over nodes faster.
1010              */
1011             calc_mu(start, homenr,
1012                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1013                     mu, mu+DIM);
1014         }
1015     }
1016
1017     if (fr->ePBC != epbcNONE)
1018     {
1019         /* Compute shift vectors every step,
1020          * because of pressure coupling or box deformation!
1021          */
1022         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1023         {
1024             calc_shifts(box, fr->shift_vec);
1025         }
1026
1027         if (bCalcCGCM)
1028         {
1029             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1030             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1031         }
1032         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1033         {
1034             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1035         }
1036     }
1037
1038     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1039                                  fr->shift_vec, nbv->nbat);
1040
1041 #if GMX_MPI
1042     if (!thisRankHasDuty(cr, DUTY_PME))
1043     {
1044         /* Send particle coordinates to the pme nodes.
1045          * Since this is only implemented for domain decomposition
1046          * and domain decomposition does not use the graph,
1047          * we do not need to worry about shifting.
1048          */
1049         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1050                                  lambda[efptCOUL], lambda[efptVDW],
1051                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1052                                  step, wcycle);
1053     }
1054 #endif /* GMX_MPI */
1055
1056     if (useGpuPme)
1057     {
1058         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1059     }
1060
1061     /* do gridding for pair search */
1062     if (bNS)
1063     {
1064         if (graph && bStateChanged)
1065         {
1066             /* Calculate intramolecular shift vectors to make molecules whole */
1067             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1068         }
1069
1070         clear_rvec(vzero);
1071         box_diag[XX] = box[XX][XX];
1072         box_diag[YY] = box[YY][YY];
1073         box_diag[ZZ] = box[ZZ][ZZ];
1074
1075         wallcycle_start(wcycle, ewcNS);
1076         if (!DOMAINDECOMP(cr))
1077         {
1078             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1079             nbnxn_put_on_grid(nbv, box,
1080                               0, vzero, box_diag,
1081                               nullptr, 0, mdatoms->homenr, -1,
1082                               fr->cginfo, x.unpaddedArrayRef(),
1083                               0, nullptr);
1084             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1085         }
1086         else
1087         {
1088             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1089             nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd),
1090                                        fr->cginfo, x.unpaddedArrayRef());
1091             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1092         }
1093
1094         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1095
1096         wallcycle_stop(wcycle, ewcNS);
1097     }
1098
1099     /* initialize the GPU atom data and copy shift vector */
1100     if (bUseGPU)
1101     {
1102         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1103         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1104
1105         if (bNS)
1106         {
1107             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1108         }
1109
1110         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1111
1112         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1113
1114         if (bNS && fr->gpuBonded)
1115         {
1116             /* Now that we have put all atoms on the grid, we can assign bonded
1117              * interactions to the GPU, where the grid order is
1118              * needed. Also the xq, f and fshift device buffers have
1119              * been reallocated if needed, so the bonded code can
1120              * learn about them. */
1121             // TODO the xq, f, and fshift buffers are now shared
1122             // resources, so they should be maintained by a
1123             // higher-level object than the nb module.
1124             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1125                                                                   top->idef,
1126                                                                   Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1127                                                                   Nbnxm::gpu_get_f(nbv->gpu_nbv),
1128                                                                   Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1129             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1130         }
1131
1132         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1133     }
1134
1135     /* do local pair search */
1136     if (bNS)
1137     {
1138         wallcycle_start_nocount(wcycle, ewcNS);
1139         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1140         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1141         nbnxn_make_pairlist(nbv, Nbnxm::InteractionLocality::Local,
1142                             &top->excls, step, nrnb);
1143         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1144         wallcycle_stop(wcycle, ewcNS);
1145     }
1146     else
1147     {
1148         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1149                                         FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1150                                         nbv->nbat, wcycle);
1151     }
1152
1153     if (bUseGPU)
1154     {
1155         ddBalanceRegionHandler.openBeforeForceComputationGpu();
1156
1157         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1158
1159         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1160         Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1161         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1162
1163         // bonded work not split into separate local and non-local, so with DD
1164         // we can only launch the kernel after non-local coordinates have been received.
1165         if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
1166         {
1167             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1168             fr->gpuBonded->launchKernels(fr, flags, box);
1169             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1170         }
1171
1172         /* launch local nonbonded work on GPU */
1173         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1174         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1175                      step, nrnb, wcycle);
1176         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1177         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1178     }
1179
1180     if (useGpuPme)
1181     {
1182         // In PME GPU and mixed mode we launch FFT / gather after the
1183         // X copy/transform to allow overlap as well as after the GPU NB
1184         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1185         // the nonbonded kernel.
1186         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1187     }
1188
1189     /* Communicate coordinates and sum dipole if necessary +
1190        do non-local pair search */
1191     if (havePPDomainDecomposition(cr))
1192     {
1193         if (bNS)
1194         {
1195             wallcycle_start_nocount(wcycle, ewcNS);
1196             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1197             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1198             nbnxn_make_pairlist(nbv, Nbnxm::InteractionLocality::NonLocal,
1199                                 &top->excls, step, nrnb);
1200             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1201             wallcycle_stop(wcycle, ewcNS);
1202         }
1203         else
1204         {
1205             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1206
1207             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1208                                             FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1209                                             nbv->nbat, wcycle);
1210         }
1211
1212         if (bUseGPU)
1213         {
1214             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1215
1216             /* launch non-local nonbonded tasks on GPU */
1217             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1218             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1219             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1220
1221             if (ppForceWorkload->haveGpuBondedWork)
1222             {
1223                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1224                 fr->gpuBonded->launchKernels(fr, flags, box);
1225                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1226             }
1227
1228             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1229             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1230                          step, nrnb, wcycle);
1231             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1232
1233             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1234         }
1235     }
1236
1237     if (bUseGPU)
1238     {
1239         /* launch D2H copy-back F */
1240         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1241         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1242         if (havePPDomainDecomposition(cr))
1243         {
1244             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1245                                       flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1246         }
1247         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1248                                   flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1249         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1250
1251         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1252         {
1253             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1254             fr->gpuBonded->launchEnergyTransfer();
1255             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1256         }
1257         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1258     }
1259
1260     if (bStateChanged && inputrecNeedMutot(inputrec))
1261     {
1262         if (PAR(cr))
1263         {
1264             gmx_sumd(2*DIM, mu, cr);
1265
1266             ddBalanceRegionHandler.reopenRegionCpu();
1267         }
1268
1269         for (i = 0; i < 2; i++)
1270         {
1271             for (j = 0; j < DIM; j++)
1272             {
1273                 fr->mu_tot[i][j] = mu[i*DIM + j];
1274             }
1275         }
1276     }
1277     if (fr->efep == efepNO)
1278     {
1279         copy_rvec(fr->mu_tot[0], mu_tot);
1280     }
1281     else
1282     {
1283         for (j = 0; j < DIM; j++)
1284         {
1285             mu_tot[j] =
1286                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1287                 lambda[efptCOUL]*fr->mu_tot[1][j];
1288         }
1289     }
1290
1291     /* Reset energies */
1292     reset_enerdata(enerd);
1293     clear_rvecs(SHIFTS, fr->fshift);
1294
1295     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1296     {
1297         wallcycle_start(wcycle, ewcPPDURINGPME);
1298         dd_force_flop_start(cr->dd, nrnb);
1299     }
1300
1301     if (inputrec->bRot)
1302     {
1303         wallcycle_start(wcycle, ewcROT);
1304         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1305         wallcycle_stop(wcycle, ewcROT);
1306     }
1307
1308     /* Temporary solution until all routines take PaddedRVecVector */
1309     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1310
1311     /* Start the force cycle counter.
1312      * Note that a different counter is used for dynamic load balancing.
1313      */
1314     wallcycle_start(wcycle, ewcFORCE);
1315
1316     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1317     if (bDoForces)
1318     {
1319         /* If we need to compute the virial, we might need a separate
1320          * force buffer for algorithms for which the virial is calculated
1321          * directly, such as PME.
1322          */
1323         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1324         {
1325             forceRef = *fr->forceBufferForDirectVirialContributions;
1326
1327             /* TODO: update comment
1328              * We only compute forces on local atoms. Note that vsites can
1329              * spread to non-local atoms, but that part of the buffer is
1330              * cleared separately in the vsite spreading code.
1331              */
1332             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1333         }
1334         /* Clear the short- and long-range forces */
1335         clear_rvecs_omp(fr->natoms_force_constr, f);
1336     }
1337
1338     /* forceWithVirial uses the local atom range only */
1339     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1340
1341     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1342     {
1343         clear_pull_forces(inputrec->pull_work);
1344     }
1345
1346     /* We calculate the non-bonded forces, when done on the CPU, here.
1347      * We do this before calling do_force_lowlevel, because in that
1348      * function, the listed forces are calculated before PME, which
1349      * does communication.  With this order, non-bonded and listed
1350      * force calculation imbalance can be balanced out by the domain
1351      * decomposition load balancing.
1352      */
1353
1354     if (!bUseOrEmulGPU)
1355     {
1356         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1357                      step, nrnb, wcycle);
1358     }
1359
1360     if (fr->efep != efepNO)
1361     {
1362         /* Calculate the local and non-local free energy interactions here.
1363          * Happens here on the CPU both with and without GPU.
1364          */
1365         if (fr->nbv->pairlistSets[Nbnxm::InteractionLocality::Local].nbl_fep[0]->nrj > 0)
1366         {
1367             do_nb_verlet_fep(&fr->nbv->pairlistSets[Nbnxm::InteractionLocality::Local],
1368                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1369                              inputrec->fepvals, lambda,
1370                              enerd, flags, nrnb, wcycle);
1371         }
1372
1373         if (DOMAINDECOMP(cr) &&
1374             fr->nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal].nbl_fep[0]->nrj > 0)
1375         {
1376             do_nb_verlet_fep(&fr->nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal],
1377                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1378                              inputrec->fepvals, lambda,
1379                              enerd, flags, nrnb, wcycle);
1380         }
1381     }
1382
1383     if (!bUseOrEmulGPU)
1384     {
1385         if (havePPDomainDecomposition(cr))
1386         {
1387             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1388                          step, nrnb, wcycle);
1389         }
1390
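             /* bUseOrEmulGPU is false in this branch, so iloc always resolves to
              * the local interaction list. */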
1391         const Nbnxm::InteractionLocality iloc =
1392             (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
1393
1394         /* Add all the non-bonded forces to the normal force array.
1395          * This can be split into a local and a non-local part when overlapping
1396          * communication with calculation with domain decomposition.
1397          */
1398         wallcycle_stop(wcycle, ewcFORCE);
1399
1400         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
1401
1402         wallcycle_start_nocount(wcycle, ewcFORCE);
1403
1404         /* if there are multiple fshift output buffers reduce them */
1405         if ((flags & GMX_FORCE_VIRIAL) &&
1406             nbv->pairlistSets[iloc].nnbl > 1)
1407         {
1408             /* This is not in a subcounter because it takes a
1409                negligible and constant-sized amount of time */
1410             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1411                                                      fr->fshift);
1412         }
1413     }
1414
1415     /* update QMMMrec, if necessary */
1416     if (fr->bQMMM)
1417     {
1418         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1419     }
1420
1421     /* Compute the bonded and non-bonded energies and optionally forces */
1422     do_force_lowlevel(fr, inputrec, &(top->idef),
1423                       cr, ms, nrnb, wcycle, mdatoms,
1424                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1425                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1426                       flags,
1427                       &cycles_pme, ddBalanceRegionHandler);
1428
1429     wallcycle_stop(wcycle, ewcFORCE);
1430
1431     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1432                          step, t, wcycle,
1433                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1434                          flags, &forceWithVirial, enerd,
1435                          ed, bNS);
1436
1437     if (bUseOrEmulGPU)
1438     {
1439         /* wait for non-local forces (or calculate in emulation mode) */
1440         if (havePPDomainDecomposition(cr))
1441         {
1442             if (bUseGPU)
1443             {
1444                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1445                 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1446                                             flags, Nbnxm::AtomLocality::NonLocal,
1447                                             ppForceWorkload->haveGpuBondedWork,
1448                                             enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1449                                             fr->fshift);
1450                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1451             }
1452             else
1453             {
1454                 wallcycle_start_nocount(wcycle, ewcFORCE);
1455                 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1456                              step, nrnb, wcycle);
1457                 wallcycle_stop(wcycle, ewcFORCE);
1458             }
1459
1460             /* skip the reduction if there was no non-local work to do */
1461             if (!nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal].nblGpu[0]->sci.empty())
1462             {
1463                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1464                                                nbv->nbat, f, wcycle);
1465             }
1466         }
1467     }
1468
1469     if (havePPDomainDecomposition(cr))
1470     {
1471         /* We are done with the CPU compute.
1472          * We will now communicate the non-local forces.
1473          * If we use a GPU this will overlap with GPU work, so in that case
1474          * we do not close the DD force balancing region here.
1475          */
1476         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1477
1478         if (bDoForces)
1479         {
1480             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1481         }
1482     }
1483
1484     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1485     // an alternating wait/reduction scheme.
1486     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1487     if (alternateGpuWait)
1488     {
1489         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1490     }
1491
1492     if (!alternateGpuWait && useGpuPme)
1493     {
1494         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1495     }
1496
1497     /* Wait for local GPU NB outputs on the non-alternating wait path */
1498     if (!alternateGpuWait && bUseGPU)
1499     {
1500         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1501          * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1502          * but even with a step of 0.1 ms the difference is less than 1%
1503          * of the step time.
1504          */
1505         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1506
1507         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1508         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1509                                     flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1510                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1511                                     fr->fshift);
1512         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1513
1514         if (ddBalanceRegionHandler.useBalancingRegion())
1515         {
1516             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1517             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1518             {
1519                 /* We measured few cycles, so it could be that the kernel
1520                  * and transfer finished earlier and there was no actual
1521                  * wait time, only API call overhead.
1522                  * Then the actual time could be anywhere between 0 and
1523                  * cycles_wait_est. We will use half of cycles_wait_est.
1524                  */
1525                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1526             }
1527             ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
1528         }
1529     }
1530
1531     if (fr->nbv->emulateGpu())
1532     {
1533         // NOTE: emulation kernel is not included in the balancing region,
1534         // but emulation mode does not target performance anyway
1535         wallcycle_start_nocount(wcycle, ewcFORCE);
1536         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1537                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1538                      step, nrnb, wcycle);
1539         wallcycle_stop(wcycle, ewcFORCE);
1540     }
1541
1542     if (useGpuPme)
1543     {
1544         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1545     }
1546
1547     if (bUseGPU)
1548     {
1549         /* now clear the GPU outputs while we finish the step on the CPU */
1550         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1551         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1552         Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1553
1554         /* Is dynamic pair-list pruning activated? */
1555         if (nbv->listParams->useDynamicPruning)
1556         {
1557             launchGpuRollingPruning(cr, nbv, inputrec, step);
1558         }
1559         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1560         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1561     }
1562
1563     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1564     {
1565         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1566         // in principle this should be included in the DD balancing region,
1567         // but generally it is infrequent so we'll omit it for the sake of
1568         // simpler code
1569         fr->gpuBonded->accumulateEnergyTerms(enerd);
1570         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1571
1572         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1573         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1574         fr->gpuBonded->clearEnergies();
1575         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1576         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1577     }
1578
1579     /* Do the nonbonded GPU (or emulation) force buffer reduction
1580      * on the non-alternating path. */
1581     if (bUseOrEmulGPU && !alternateGpuWait)
1582     {
1583         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1584                                        nbv->nbat, f, wcycle);
1585     }
1586     if (DOMAINDECOMP(cr))
1587     {
1588         dd_force_flop_stop(cr->dd, nrnb);
1589     }
1590
1591     if (bDoForces)
1592     {
1593         /* If we have NoVirSum forces, but we do not calculate the virial,
1594          * we sum fr->f_novirsum=f later.
1595          */
1596         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1597         {
1598             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1599                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1600         }
1601
1602         if (flags & GMX_FORCE_VIRIAL)
1603         {
1604             /* Calculation of the virial must be done after vsites! */
1605             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1606                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1607         }
1608     }
1609
1610     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1611     {
1612         /* In case of node-splitting, the PP nodes receive the long-range
1613          * forces, virial and energy from the PME nodes here.
1614          */
1615         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1616     }
1617
1618     if (bDoForces)
1619     {
1620         post_process_forces(cr, step, nrnb, wcycle,
1621                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1622                             vir_force, mdatoms, graph, fr, vsite,
1623                             flags);
1624     }
1625
1626     if (flags & GMX_FORCE_ENERGY)
1627     {
1628         /* Sum the potential energy terms from group contributions */
1629         sum_epot(&(enerd->grpp), enerd->term);
1630
1631         if (!EI_TPI(inputrec->eI))
1632         {
1633             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1634         }
1635     }
1636 }
1637
1638 static void do_force_cutsGROUP(FILE *fplog,
1639                                const t_commrec *cr,
1640                                const gmx_multisim_t *ms,
1641                                const t_inputrec *inputrec,
1642                                gmx::Awh *awh,
1643                                gmx_enfrot *enforcedRotation,
1644                                int64_t step,
1645                                t_nrnb *nrnb,
1646                                gmx_wallcycle_t wcycle,
1647                                gmx_localtop_t *top,
1648                                const gmx_groups_t *groups,
1649                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1650                                history_t *hist,
1651                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1652                                tensor vir_force,
1653                                const t_mdatoms *mdatoms,
1654                                gmx_enerdata_t *enerd,
1655                                t_fcdata *fcd,
1656                                real *lambda,
1657                                t_graph *graph,
1658                                t_forcerec *fr,
1659                                const gmx_vsite_t *vsite,
1660                                rvec mu_tot,
1661                                double t,
1662                                gmx_edsam *ed,
1663                                int flags,
1664                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
1665 {
1666     int        cg0, cg1, i, j;
1667     double     mu[2*DIM];
1668     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1669     gmx_bool   bDoForces;
1670     float      cycles_pme;
1671
1672     const int  start  = 0;
1673     const int  homenr = mdatoms->homenr;
1674
1675     clear_mat(vir_force);
1676
1677     cg0 = 0;
1678     if (DOMAINDECOMP(cr))
1679     {
1680         cg1 = cr->dd->globalAtomGroupIndices.size();
1681     }
1682     else
1683     {
1684         cg1 = top->cgs.nr;
1685     }
1686     if (fr->n_tpi > 0)
1687     {
1688         cg1--;
1689     }
1690
1691     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1692     bNS            = ((flags & GMX_FORCE_NS) != 0);
1693     /* Should the charge-group grid be filled during this neighbour-search step? */
1694     bFillGrid      = (bNS && bStateChanged);
1695     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1696     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1697
1698     if (bStateChanged)
1699     {
1700         update_forcerec(fr, box);
1701
1702         if (inputrecNeedMutot(inputrec))
1703         {
1704             /* Calculate total (local) dipole moment in a temporary common array.
1705              * This makes it possible to sum them over nodes faster.
1706              */
1707             calc_mu(start, homenr,
1708                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1709                     mu, mu+DIM);
1710         }
1711     }
1712
1713     if (fr->ePBC != epbcNONE)
1714     {
1715         /* Compute shift vectors every step,
1716          * because of pressure coupling or box deformation!
1717          */
1718         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1719         {
1720             calc_shifts(box, fr->shift_vec);
1721         }
1722
1723         if (bCalcCGCM)
1724         {
1725             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1726                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1727             inc_nrnb(nrnb, eNR_CGCM, homenr);
1728             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1729         }
1730         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1731         {
1732             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1733         }
1734     }
1735     else if (bCalcCGCM)
1736     {
1737         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1738         inc_nrnb(nrnb, eNR_CGCM, homenr);
1739     }
1740
1741     if (bCalcCGCM && gmx_debug_at)
1742     {
1743         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1744     }
1745
1746 #if GMX_MPI
1747     if (!thisRankHasDuty(cr, DUTY_PME))
1748     {
1749         /* Send particle coordinates to the pme nodes.
1750          * Since this is only implemented for domain decomposition
1751          * and domain decomposition does not use the graph,
1752          * we do not need to worry about shifting.
1753          */
1754         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1755                                  lambda[efptCOUL], lambda[efptVDW],
1756                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1757                                  step, wcycle);
1758     }
1759 #endif /* GMX_MPI */
1760
1761     /* Communicate coordinates and sum dipole if necessary */
1762     if (DOMAINDECOMP(cr))
1763     {
1764         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1765
1766         /* No GPU support, no move_x overlap, so reopen the balance region here */
1767         ddBalanceRegionHandler.reopenRegionCpu();
1768     }
1769
1770     if (inputrecNeedMutot(inputrec))
1771     {
1772         if (bStateChanged)
1773         {
1774             if (PAR(cr))
1775             {
1776                 gmx_sumd(2*DIM, mu, cr);
1777
1778                 ddBalanceRegionHandler.reopenRegionCpu();
1779             }
1780             for (i = 0; i < 2; i++)
1781             {
1782                 for (j = 0; j < DIM; j++)
1783                 {
1784                     fr->mu_tot[i][j] = mu[i*DIM + j];
1785                 }
1786             }
1787         }
1788         if (fr->efep == efepNO)
1789         {
1790             copy_rvec(fr->mu_tot[0], mu_tot);
1791         }
1792         else
1793         {
1794             for (j = 0; j < DIM; j++)
1795             {
1796                 mu_tot[j] =
1797                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1798             }
1799         }
1800     }
1801
1802     /* Reset energies */
1803     reset_enerdata(enerd);
1804     clear_rvecs(SHIFTS, fr->fshift);
1805
1806     if (bNS)
1807     {
1808         wallcycle_start(wcycle, ewcNS);
1809
1810         if (graph && bStateChanged)
1811         {
1812             /* Calculate intramolecular shift vectors to make molecules whole */
1813             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1814         }
1815
1816         /* Do the actual neighbour searching */
1817         ns(fplog, fr, box,
1818            groups, top, mdatoms,
1819            cr, nrnb, bFillGrid);
1820
1821         wallcycle_stop(wcycle, ewcNS);
1822     }
1823
1824     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1825     {
1826         wallcycle_start(wcycle, ewcPPDURINGPME);
1827         dd_force_flop_start(cr->dd, nrnb);
1828     }
1829
1830     if (inputrec->bRot)
1831     {
1832         wallcycle_start(wcycle, ewcROT);
1833         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1834         wallcycle_stop(wcycle, ewcROT);
1835     }
1836
1837     /* Temporary solution until all routines take PaddedRVecVector */
1838     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1839
1840     /* Start the force cycle counter.
1841      * Note that a different counter is used for dynamic load balancing.
1842      */
1843     wallcycle_start(wcycle, ewcFORCE);
1844
1845     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1846     if (bDoForces)
1847     {
1848         /* If we need to compute the virial, we might need a separate
1849          * force buffer for algorithms for which the virial is calculated
1850          * separately, such as PME.
1851          */
1852         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1853         {
1854             forceRef = *fr->forceBufferForDirectVirialContributions;
1855
1856             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1857         }
1858
1859         /* Clear the short- and long-range forces */
1860         clear_rvecs(fr->natoms_force_constr, f);
1861     }
1862
1863     /* forceWithVirial might need the full force atom range */
1864     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1865
1866     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1867     {
1868         clear_pull_forces(inputrec->pull_work);
1869     }
1870
1871     /* update QMMMrec, if necessary */
1872     if (fr->bQMMM)
1873     {
1874         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1875     }
1876
1877     /* Compute the bonded and non-bonded energies and optionally forces */
1878     do_force_lowlevel(fr, inputrec, &(top->idef),
1879                       cr, ms, nrnb, wcycle, mdatoms,
1880                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1881                       box, inputrec->fepvals, lambda,
1882                       graph, &(top->excls), fr->mu_tot,
1883                       flags,
1884                       &cycles_pme, ddBalanceRegionHandler);
1885
1886     wallcycle_stop(wcycle, ewcFORCE);
1887
1888     if (DOMAINDECOMP(cr))
1889     {
1890         dd_force_flop_stop(cr->dd, nrnb);
1891
1892         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1893     }
1894
1895     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1896                          step, t, wcycle,
1897                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1898                          flags, &forceWithVirial, enerd,
1899                          ed, bNS);
1900
1901     if (bDoForces)
1902     {
1903         /* Communicate the forces */
1904         if (DOMAINDECOMP(cr))
1905         {
1906             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1907             /* Do we need to communicate the separate force array
1908              * for terms that do not contribute to the single sum virial?
1909              * Position restraints and electric fields do not introduce
1910              * inter-cg forces, only full electrostatics methods do.
1911              * When we do not calculate the virial, fr->f_novirsum = f,
1912              * so we have already communicated these forces.
1913              */
1914             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1915                 (flags & GMX_FORCE_VIRIAL))
1916             {
1917                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1918             }
1919         }
1920
1921         /* If we have NoVirSum forces, but we do not calculate the virial,
1922          * we sum fr->f_novirsum=f later.
1923          */
1924         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1925         {
1926             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1927                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1928         }
1929
1930         if (flags & GMX_FORCE_VIRIAL)
1931         {
1932             /* Calculation of the virial must be done after vsites! */
1933             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1934                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1935         }
1936     }
1937
1938     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1939     {
1940         /* In case of node-splitting, the PP nodes receive the long-range
1941          * forces, virial and energy from the PME nodes here.
1942          */
1943         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1944     }
1945
1946     if (bDoForces)
1947     {
1948         post_process_forces(cr, step, nrnb, wcycle,
1949                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1950                             vir_force, mdatoms, graph, fr, vsite,
1951                             flags);
1952     }
1953
1954     if (flags & GMX_FORCE_ENERGY)
1955     {
1956         /* Sum the potential energy terms from group contributions */
1957         sum_epot(&(enerd->grpp), enerd->term);
1958
1959         if (!EI_TPI(inputrec->eI))
1960         {
1961             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1962         }
1963     }
1964
1965 }
1966
1967 void do_force(FILE                                     *fplog,
1968               const t_commrec                          *cr,
1969               const gmx_multisim_t                     *ms,
1970               const t_inputrec                         *inputrec,
1971               gmx::Awh                                 *awh,
1972               gmx_enfrot                               *enforcedRotation,
1973               int64_t                                   step,
1974               t_nrnb                                   *nrnb,
1975               gmx_wallcycle_t                           wcycle,
1976               gmx_localtop_t                           *top,
1977               const gmx_groups_t                       *groups,
1978               matrix                                    box,
1979               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
1980               history_t                                *hist,
1981               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
1982               tensor                                    vir_force,
1983               const t_mdatoms                          *mdatoms,
1984               gmx_enerdata_t                           *enerd,
1985               t_fcdata                                 *fcd,
1986               gmx::ArrayRef<real>                       lambda,
1987               t_graph                                  *graph,
1988               t_forcerec                               *fr,
1989               gmx::PpForceWorkload                     *ppForceWorkload,
1990               const gmx_vsite_t                        *vsite,
1991               rvec                                      mu_tot,
1992               double                                    t,
1993               gmx_edsam                                *ed,
1994               int                                       flags,
1995               const DDBalanceRegionHandler             &ddBalanceRegionHandler)
1996 {
1997     /* modify force flag if not doing nonbonded */
1998     if (!fr->bNonbonded)
1999     {
2000         flags &= ~GMX_FORCE_NONBONDED;
2001     }
2002
2003     switch (inputrec->cutoff_scheme)
2004     {
2005         case ecutsVERLET:
2006             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2007                                 awh, enforcedRotation, step, nrnb, wcycle,
2008                                 top,
2009                                 groups,
2010                                 box, x, hist,
2011                                 force, vir_force,
2012                                 mdatoms,
2013                                 enerd, fcd,
2014                                 lambda.data(), graph,
2015                                 fr,
2016                                 ppForceWorkload,
2017                                 fr->ic,
2018                                 vsite, mu_tot,
2019                                 t, ed,
2020                                 flags,
2021                                 ddBalanceRegionHandler);
2022             break;
2023         case ecutsGROUP:
2024             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2025                                awh, enforcedRotation, step, nrnb, wcycle,
2026                                top,
2027                                groups,
2028                                box, x, hist,
2029                                force, vir_force,
2030                                mdatoms,
2031                                enerd, fcd,
2032                                lambda.data(), graph,
2033                                fr, vsite, mu_tot,
2034                                t, ed,
2035                                flags,
2036                                ddBalanceRegionHandler);
2037             break;
2038         default:
2039             gmx_incons("Invalid cut-off scheme passed!");
2040     }
2041
2042     /* In case we don't have constraints and are using GPUs, the next balancing
2043      * region starts here.
2044      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2045      * virial calculation and COM pulling, is thus not included in
2046      * the balance timing, which is ok as most tasks do communication.
2047      */
2048     ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
2049 }
2050
2051
2052 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2053                         const t_inputrec *ir, const t_mdatoms *md,
2054                         t_state *state)
2055 {
2056     int             i, m, start, end;
2057     int64_t         step;
2058     real            dt = ir->delta_t;
2059     real            dvdl_dum;
2060     rvec           *savex;
2061
2062     /* We need to allocate one element extra, since we might use
2063      * (unaligned) 4-wide SIMD loads to access rvec entries.
2064      */
2065     snew(savex, state->natoms + 1);
2066
2067     start = 0;
2068     end   = md->homenr;
2069
2070     if (debug)
2071     {
2072         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2073                 start, md->homenr, end);
2074     }
2075     /* Do a first constrain to reset particles... */
2076     step = ir->init_step;
2077     if (fplog)
2078     {
2079         char buf[STEPSTRSIZE];
2080         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2081                 gmx_step_str(step, buf));
2082     }
2083     dvdl_dum = 0;
2084
2085     /* constrain the current position */
2086     constr->apply(TRUE, FALSE,
2087                   step, 0, 1.0,
2088                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
2089                   state->box,
2090                   state->lambda[efptBONDED], &dvdl_dum,
2091                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2092     if (EI_VV(ir->eI))
2093     {
2094         /* constrain the initial velocity, and save it */
2095         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2096         constr->apply(TRUE, FALSE,
2097                       step, 0, 1.0,
2098                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2099                       state->box,
2100                       state->lambda[efptBONDED], &dvdl_dum,
2101                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2102     }
2103     /* constrain the initial velocities at t-dt/2 */
2104     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2105     {
2106         auto x = makeArrayRef(state->x).subArray(start, end);
2107         auto v = makeArrayRef(state->v).subArray(start, end);
2108         for (i = start; (i < end); i++)
2109         {
2110             for (m = 0; (m < DIM); m++)
2111             {
2112                 /* Reverse the velocity */
2113                 v[i][m] = -v[i][m];
2114                 /* Store the position at t-dt in buf */
2115                 savex[i][m] = x[i][m] + dt*v[i][m];
2116             }
2117         }
2118         /* Shake the positions at t=-dt with the positions at t=0
2119          * as reference coordinates.
2120          */
2121         if (fplog)
2122         {
2123             char buf[STEPSTRSIZE];
2124             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2125                     gmx_step_str(step, buf));
2126         }
2127         dvdl_dum = 0;
2128         constr->apply(TRUE, FALSE,
2129                       step, -1, 1.0,
2130                       state->x.rvec_array(), savex, nullptr,
2131                       state->box,
2132                       state->lambda[efptBONDED], &dvdl_dum,
2133                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2134
2135         for (i = start; i < end; i++)
2136         {
2137             for (m = 0; m < DIM; m++)
2138             {
2139                 /* Re-reverse the velocities */
2140                 v[i][m] = -v[i][m];
2141             }
2142         }
2143     }
2144     sfree(savex);
2145 }
2146
2147
2148 static void
2149 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2150                 double *enerout, double *virout)
2151 {
2152     double enersum, virsum;
2153     double invscale, invscale2, invscale3;
2154     double r, ea, eb, ec, pa, pb, pc, pd;
2155     double y0, f, g, h;
2156     int    ri, offset;
2157     double tabfactor;
2158
2159     invscale  = 1.0/scale;
2160     invscale2 = invscale*invscale;
2161     invscale3 = invscale*invscale2;
2162
2163     /* Following summation derived from cubic spline definition,
2164      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2165      * the cubic spline.  We first calculate the negative of the
2166      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2167      * add the more standard, abrupt cutoff correction to that result,
2168      * yielding the long-range correction for a switched function.  We
2169      * perform both the pressure and energy loops at the same time for
2170      * simplicity, as the computational cost is low. */
2171
2172     if (offstart == 0)
2173     {
2174         /* Since the dispersion table has been scaled down by a factor of
2175          * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2176          * c6/c12 parameters inside nbfp[] being scaled up (to save
2177          * flops in kernels), we need to correct for this.
2178          */
2179         tabfactor = 6.0;
2180     }
2181     else
2182     {
2183         tabfactor = 12.0;
2184     }
2185
2186     enersum = 0.0;
2187     virsum  = 0.0;
2188     for (ri = rstart; ri < rend; ++ri)
2189     {
2190         r  = ri*invscale;
2191         ea = invscale3;
2192         eb = 2.0*invscale2*r;
2193         ec = invscale*r*r;
2194
2195         pa = invscale3;
2196         pb = 3.0*invscale2*r;
2197         pc = 3.0*invscale*r*r;
2198         pd = r*r*r;
2199
2200         /* the stride of 8 reflects the packing of the vdwtab array: four spline
2201            coefficients each for dispersion and repulsion per table point */
2202
2203         offset = 8*ri + offstart;
2204         y0     = vdwtab[offset];
2205         f      = vdwtab[offset+1];
2206         g      = vdwtab[offset+2];
2207         h      = vdwtab[offset+3];
2208
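             /* Within this segment the spline is V = y0 + f*eps + g*eps^2 + h*eps^3, with
              * eps the fractional position; the weighted sums below accumulate the
              * analytic segment integrals of r'^2*V(r') (energy) and r'^3*dV/dr' (virial). */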
2209         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2210         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2211     }
2212     *enerout = 4.0*M_PI*enersum*tabfactor;
2213     *virout  = 4.0*M_PI*virsum*tabfactor;
2214 }
2215
2216 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2217 {
2218     double   eners[2], virs[2], enersum, virsum;
2219     double   r0, rc3, rc9;
2220     int      ri0, ri1, i;
2221     real     scale, *vdwtab;
2222
2223     fr->enershiftsix    = 0;
2224     fr->enershifttwelve = 0;
2225     fr->enerdiffsix     = 0;
2226     fr->enerdifftwelve  = 0;
2227     fr->virdiffsix      = 0;
2228     fr->virdifftwelve   = 0;
2229
2230     const interaction_const_t *ic = fr->ic;
2231
2232     if (eDispCorr != edispcNO)
2233     {
2234         for (i = 0; i < 2; i++)
2235         {
2236             eners[i] = 0;
2237             virs[i]  = 0;
2238         }
2239         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2240             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2241             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2242             (ic->vdwtype == evdwSHIFT) ||
2243             (ic->vdwtype == evdwSWITCH))
2244         {
2245             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2246                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2247                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2248             {
2249                 gmx_fatal(FARGS,
2250                           "With dispersion correction rvdw-switch can not be zero "
2251                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2252             }
2253
2254             /* TODO This code depends on the logic in tables.c that
2255                constructs the table layout, which should be made
2256                explicit in future cleanup. */
2257             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2258                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2259             scale  = fr->dispersionCorrectionTable->scale;
2260             vdwtab = fr->dispersionCorrectionTable->data;
2261
2262             /* Round the cut-offs to exact table values for precision */
2263             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2264             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2265
2266             /* The code below has some support for handling force-switching, i.e.
2267              * when the force (instead of potential) is switched over a limited
2268              * region. This leads to a constant shift in the potential inside the
2269              * switching region, which we can handle by adding a constant energy
2270              * term in the force-switch case just like when we do potential-shift.
2271              *
2272              * For now this is not enabled, but to keep the functionality in the
2273              * code we check separately for switch and shift. When we do force-switch
2274              * the shifting point is rvdw_switch, while it is the cutoff when we
2275              * have a classical potential-shift.
2276              *
2277              * For a pure potential-shift the potential has a constant shift
2278              * all the way out to the cutoff, and that is it. For other forms
2279              * we need to calculate the constant shift up to the point where we
2280              * start modifying the potential.
2281              */
2282             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2283
2284             r0   = ri0/scale;
2285             rc3  = r0*r0*r0;
2286             rc9  = rc3*rc3*rc3;
2287
2288             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2289                 (ic->vdwtype == evdwSHIFT))
2290             {
2291                 /* Determine the constant energy shift below rvdw_switch.
2292                  * Table has a scale factor since we have scaled it down to compensate
2293                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2294                  */
2295                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2296                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2297             }
2298             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2299             {
2300                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2301                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2302             }
2303
2304             /* Add the constant part from 0 to rvdw_switch.
2305              * This integration from 0 to rvdw_switch overcounts the number
2306              * of interactions by 1, as it also counts the self interaction.
2307              * We will correct for this later.
2308              */
2309             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2310             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2311
2312             /* Calculate the contribution in the range [r0,r1] where we
2313              * modify the potential. For a pure potential-shift modifier we will
2314              * have ri0==ri1, and there will not be any contribution here.
2315              */
2316             for (i = 0; i < 2; i++)
2317             {
2318                 enersum = 0;
2319                 virsum  = 0;
2320                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2321                 eners[i] -= enersum;
2322                 virs[i]  -= virsum;
2323             }
2324
2325             /* Alright: Above we compensated by REMOVING the parts outside r0
2326              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2327              *
2328              * Regardless of whether r0 is the point where we start switching,
2329              * or the cutoff where we calculated the constant shift, we include
2330              * all the parts we are missing out to infinity from r0 by
2331              * calculating the analytical dispersion correction.
2332              */
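                 /* Analytic tail integrals from r0 to infinity:
                  *   int 4 pi r^2 (-1/r^6)  dr = -4 pi/(3 r0^3)
                  *   int 4 pi r^2 ( 1/r^12) dr =  4 pi/(9 r0^9)
                  * the virial terms follow from int 4 pi r^3 dV/dr dr. */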
2333             eners[0] += -4.0*M_PI/(3.0*rc3);
2334             eners[1] +=  4.0*M_PI/(9.0*rc9);
2335             virs[0]  +=  8.0*M_PI/rc3;
2336             virs[1]  += -16.0*M_PI/(3.0*rc9);
2337         }
2338         else if (ic->vdwtype == evdwCUT ||
2339                  EVDW_PME(ic->vdwtype) ||
2340                  ic->vdwtype == evdwUSER)
2341         {
2342             if (ic->vdwtype == evdwUSER && fplog)
2343             {
2344                 fprintf(fplog,
2345                         "WARNING: using dispersion correction with user tables\n");
2346             }
2347
2348             /* Note that with LJ-PME, the dispersion correction is multiplied
2349              * by the difference between the actual C6 and the C6 value
2350              * that the combination rule would produce.
2351              * This means the normal energy and virial difference formulas
2352              * can be used here.
2353              */
2354
2355             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2356             rc9  = rc3*rc3*rc3;
2357             /* Contribution beyond the cut-off */
2358             eners[0] += -4.0*M_PI/(3.0*rc3);
2359             eners[1] +=  4.0*M_PI/(9.0*rc9);
2360             if (ic->vdw_modifier == eintmodPOTSHIFT)
2361             {
2362                 /* Contribution within the cut-off */
2363                 eners[0] += -4.0*M_PI/(3.0*rc3);
2364                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2365             }
2366             /* Contribution beyond the cut-off */
2367             virs[0]  +=  8.0*M_PI/rc3;
2368             virs[1]  += -16.0*M_PI/(3.0*rc9);
2369         }
2370         else
2371         {
2372             gmx_fatal(FARGS,
2373                       "Dispersion correction is not implemented for vdw-type = %s",
2374                       evdw_names[ic->vdwtype]);
2375         }
2376
2377         /* When we deprecate the group kernels the code below can go too */
2378         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2379         {
2380             /* Calculate self-interaction coefficient (assuming that
2381              * the reciprocal-space contribution is constant in the
2382              * region that contributes to the self-interaction).
2383              */
2384             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2385
2386             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2387             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2388         }
2389
2390         fr->enerdiffsix    = eners[0];
2391         fr->enerdifftwelve = eners[1];
2392         /* The 0.5 is due to the Gromacs definition of the virial */
2393         fr->virdiffsix     = 0.5*virs[0];
2394         fr->virdifftwelve  = 0.5*virs[1];
2395     }
2396 }
2397
2398 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2399                    const matrix box, real lambda, tensor pres, tensor virial,
2400                    real *prescorr, real *enercorr, real *dvdlcorr)
2401 {
2402     gmx_bool bCorrAll, bCorrPres;
2403     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2404     int      m;
2405
2406     *prescorr = 0;
2407     *enercorr = 0;
2408     *dvdlcorr = 0;
2409
2410     clear_mat(virial);
2411     clear_mat(pres);
2412
2413     if (ir->eDispCorr != edispcNO)
2414     {
2415         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2416                      ir->eDispCorr == edispcAllEnerPres);
2417         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2418                      ir->eDispCorr == edispcAllEnerPres);
2419
2420         invvol = 1/det(box);
2421         if (fr->n_tpi)
2422         {
2423             /* Only correct for the interactions with the inserted molecule */
2424             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2425             ninter = fr->n_tpi;
2426         }
2427         else
2428         {
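                 /* All atoms contribute here; ninter*dens then equals N^2/(2V),
                  * i.e. the number of interacting pairs per unit volume. */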
2429             dens   = fr->numAtomsForDispersionCorrection*invvol;
2430             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2431         }
2432
2433         if (ir->efep == efepNO)
2434         {
2435             avcsix    = fr->avcsix[0];
2436             avctwelve = fr->avctwelve[0];
2437         }
2438         else
2439         {
2440             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2441             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2442         }
2443
2444         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2445         *enercorr += avcsix*enerdiff;
2446         dvdlambda  = 0.0;
2447         if (ir->efep != efepNO)
2448         {
2449             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2450         }
2451         if (bCorrAll)
2452         {
2453             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2454             *enercorr += avctwelve*enerdiff;
2455             if (fr->efep != efepNO)
2456             {
2457                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2458             }
2459         }
2460
2461         if (bCorrPres)
2462         {
2463             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2464             if (ir->eDispCorr == edispcAllEnerPres)
2465             {
2466                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2467             }
2468             /* The factor 2 is because of the Gromacs virial definition */
2469             spres = -2.0*invvol*svir*PRESFAC;
2470
2471             for (m = 0; m < DIM; m++)
2472             {
2473                 virial[m][m] += svir;
2474                 pres[m][m]   += spres;
2475             }
2476             *prescorr += spres;
2477         }
2478
2479         /* We can't currently control when this prints; for now, just print when debugging */
2480         if (debug)
2481         {
2482             if (bCorrAll)
2483             {
2484                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2485                         avcsix, avctwelve);
2486             }
2487             if (bCorrPres)
2488             {
2489                 fprintf(debug,
2490                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2491                         *enercorr, spres, svir);
2492             }
2493             else
2494             {
2495                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2496             }
2497         }
2498
2499         if (fr->efep != efepNO)
2500         {
2501             *dvdlcorr += dvdlambda;
2502         }
2503     }
2504 }
2505
2506 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2507                             const gmx_mtop_t *mtop, rvec x[],
2508                             gmx_bool bFirst)
2509 {
2510     t_graph        *graph;
2511     int             as, mol;
2512
2513     if (bFirst && fplog)
2514     {
2515         fprintf(fplog, "Removing pbc first time\n");
2516     }
2517
2518     snew(graph, 1);
2519     as = 0;
2520     for (const gmx_molblock_t &molb : mtop->molblock)
2521     {
2522         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2523         if (moltype.atoms.nr == 1 ||
2524             (!bFirst && moltype.cgs.nr == 1))
2525         {
2526             /* Just one atom or charge group in the molecule, no PBC required */
2527             as += molb.nmol*moltype.atoms.nr;
2528         }
2529         else
2530         {
2531             mk_graph_moltype(moltype, graph);
2532
2533             for (mol = 0; mol < molb.nmol; mol++)
2534             {
2535                 mk_mshift(fplog, graph, ePBC, box, x+as);
2536
2537                 shift_self(graph, box, x+as);
2538                 /* The molecule is whole now.
2539                  * We don't need the second mk_mshift call as in do_pbc_first,
2540                  * since we no longer need this graph.
2541                  */
2542
2543                 as += moltype.atoms.nr;
2544             }
2545             done_graph(graph);
2546         }
2547     }
2548     sfree(graph);
2549 }
2550
2551 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2552                        const gmx_mtop_t *mtop, rvec x[])
2553 {
2554     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2555 }
2556
2557 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2558                  const gmx_mtop_t *mtop, rvec x[])
2559 {
2560     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2561 }
2562
2563 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2564 {
2565     int t, nth;
2566     nth = gmx_omp_nthreads_get(emntDefault);
2567
2568 #pragma omp parallel for num_threads(nth) schedule(static)
2569     for (t = 0; t < nth; t++)
2570     {
2571         try
2572         {
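                 /* Split the atom range into nth contiguous, near-equal chunks */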
2573             size_t natoms = x.size();
2574             size_t offset = (natoms*t    )/nth;
2575             size_t len    = (natoms*(t + 1))/nth - offset;
2576             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2577         }
2578         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2579     }
2580 }
2581
2582 // TODO This can be cleaned up a lot, and move back to runner.cpp
2583 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2584                 const t_inputrec *inputrec,
2585                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2586                 gmx_walltime_accounting_t walltime_accounting,
2587                 nonbonded_verlet_t *nbv,
2588                 const gmx_pme_t *pme,
2589                 gmx_bool bWriteStat)
2590 {
2591     t_nrnb *nrnb_tot = nullptr;
2592     double  delta_t  = 0;
2593     double  nbfs     = 0, mflop = 0;
2594     double  elapsed_time,
2595             elapsed_time_over_all_ranks,
2596             elapsed_time_over_all_threads,
2597             elapsed_time_over_all_threads_over_all_ranks;
2598     /* Control whether it is valid to print a report. Only the
2599        simulation master may print, but it should not do so if the run
2600        terminated e.g. before a scheduled reset step. This is
2601        complicated by the fact that PME ranks are unaware of the
2602        reason why they were sent a pmerecvqxFINISH. To avoid
2603        communication deadlocks, we always do the communication for the
2604        report, even if we've decided not to write the report, because
2605        how long it takes to finish the run is not important when we've
2606        decided not to report on the simulation performance.
2607
2608        Further, we only report performance for dynamical integrators,
2609        because those are the only ones for which we plan to
2610        consider doing any optimizations. */
2611     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2612
2613     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2614     {
2615         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2616         printReport = false;
2617     }
2618
2619     if (cr->nnodes > 1)
2620     {
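         /* Reduce the flop counters over all MPI ranks in this simulation */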
2621         snew(nrnb_tot, 1);
2622 #if GMX_MPI
2623         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2624                       cr->mpi_comm_mysim);
2625 #endif
2626     }
2627     else
2628     {
2629         nrnb_tot = nrnb;
2630     }
2631
2632     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2633     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2634     if (cr->nnodes > 1)
2635     {
2636 #if GMX_MPI
2637         /* reduce elapsed_time over all MPI ranks in the current simulation */
2638         MPI_Allreduce(&elapsed_time,
2639                       &elapsed_time_over_all_ranks,
2640                       1, MPI_DOUBLE, MPI_SUM,
2641                       cr->mpi_comm_mysim);
2642         elapsed_time_over_all_ranks /= cr->nnodes;
2643         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2644          * current simulation. */
2645         MPI_Allreduce(&elapsed_time_over_all_threads,
2646                       &elapsed_time_over_all_threads_over_all_ranks,
2647                       1, MPI_DOUBLE, MPI_SUM,
2648                       cr->mpi_comm_mysim);
2649 #endif
2650     }
2651     else
2652     {
2653         elapsed_time_over_all_ranks                  = elapsed_time;
2654         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2655     }
2656
2657     if (printReport)
2658     {
2659         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2660     }
2661     if (cr->nnodes > 1)
2662     {
2663         sfree(nrnb_tot);
2664     }
2665
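    /* When domain decomposition was used, also report its statistics */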
2666     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2667     {
2668         print_dd_statistics(cr, inputrec, fplog);
2669     }
2670
2671     /* TODO Move the responsibility for any scaling by thread counts
2672      * to the code that handled the thread region, so that there's a
2673      * mechanism to keep cycle counting working during the transition
2674      * to task parallelism. */
2675     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2676     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2677     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
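    /* Sum the cycle counters over all ranks for the performance breakdown below */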
2678     auto cycle_sum(wallcycle_sum(cr, wcycle));
2679
2680     if (printReport)
2681     {
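        /* Fetch GPU timing data for the nonbonded and PME tasks, when they ran on a GPU */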
2682         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2683         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2684         if (pme_gpu_task_enabled(pme))
2685         {
2686             pme_gpu_get_timings(pme, &pme_gpu_timings);
2687         }
2688         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2689                         elapsed_time_over_all_ranks,
2690                         wcycle, cycle_sum,
2691                         nbnxn_gpu_timings,
2692                         &pme_gpu_timings);
2693
2694         if (EI_DYNAMICS(inputrec->eI))
2695         {
2696             delta_t = inputrec->delta_t;
2697         }
2698
2699         if (fplog)
2700         {
2701             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2702                        elapsed_time_over_all_ranks,
2703                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2704                        delta_t, nbfs, mflop);
2705         }
2706         if (bWriteStat)
2707         {
2708             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2709                        elapsed_time_over_all_ranks,
2710                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2711                        delta_t, nbfs, mflop);
2712         }
2713     }
2714 }
2715
2716 void initialize_lambdas(FILE               *fplog,
2717                         const t_inputrec   &ir,
2718                         bool                isMaster,
2719                         int                *fep_state,
2720                         gmx::ArrayRef<real> lambda,
2721                         double             *lam0)
2722 {
2723     /* TODO: Clean up initialization of fep_state and lambda in
2724        t_state.  This function works, but could probably use a logic
2725        rewrite to keep all the different types of efep straight. */
2726
2727     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2728     {
2729         return;
2730     }
2731
2732     const t_lambda *fep = ir.fepvals;
2733     if (isMaster)
2734     {
2735         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2736                                              a checkpoint; a kludge is in place for now
2737                                              to prevent that. */
2738     }
2739
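    /* For each free-energy component, take init_lambda when it was set; otherwise use the value for the initial FEP state from the lambda vectors */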
2740     for (int i = 0; i < efptNR; i++)
2741     {
2742         double thisLambda;
2743         /* for now, overwrite the lambda state with init_lambda, for backwards compatibility */
2744         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2745         {
2746             thisLambda = fep->init_lambda;
2747         }
2748         else
2749         {
2750             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2751         }
2752         if (isMaster)
2753         {
2754             lambda[i] = thisLambda;
2755         }
2756         if (lam0 != nullptr)
2757         {
2758             lam0[i] = thisLambda;
2759         }
2760     }
2761     if (ir.bSimTemp)
2762     {
2763         /* need to rescale control temperatures to match current state */
2764         for (int i = 0; i < ir.opts.ngtc; i++)
2765         {
2766             if (ir.opts.ref_t[i] > 0)
2767             {
2768                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2769             }
2770         }
2771     }
2772
2773     /* Log the initial lambda components */
2774     if (fplog != nullptr)
2775     {
2776         fprintf(fplog, "Initial vector of lambda components: [ ");
2777         for (const auto &l : lambda)
2778         {
2779             fprintf(fplog, "%10.4f ", l);
2780         }
2781         fprintf(fplog, "]\n");
2782     }
2783 }
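/* A minimal sketch of how a caller might use initialize_lambdas(); the names
 * inputrec, cr, state and lam0 are illustrative stand-ins for whatever the
 * actual runner code passes, so this is an assumption about usage rather than
 * the real call site:
 *
 *     double lam0[efptNR];
 *     initialize_lambdas(fplog, *inputrec, MASTER(cr),
 *                        &state->fep_state, state->lambda, lam0);
 */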