Merge "Merge branch 'release-2019' into master"
src/gromacs/mdlib/sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
117
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than that of
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
123
124
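/* Print the current step number and, once past the first nstlist steps, either an
 * estimate of the remaining wall-clock time (or the finish date) when the total
 * number of steps is known, or the current performance in ns/day.
 */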
125 void print_time(FILE                     *out,
126                 gmx_walltime_accounting_t walltime_accounting,
127                 int64_t                   step,
128                 const t_inputrec         *ir,
129                 const t_commrec          *cr)
130 {
131     time_t finish;
132     double dt, elapsed_seconds, time_per_step;
133
134 #if !GMX_THREAD_MPI
135     if (!PAR(cr))
136 #endif
137     {
138         fprintf(out, "\r");
139     }
140     fputs("step ", out);
141     fputs(gmx::int64ToString(step).c_str(), out);
142     fflush(out);
143
144     if ((step >= ir->nstlist))
145     {
146         double seconds_since_epoch = gmx_gettime();
147         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
149         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
150
151         if (ir->nsteps >= 0)
152         {
153             if (dt >= 300)
154             {
155                 finish = static_cast<time_t>(seconds_since_epoch + dt);
156                 auto timebuf = gmx_ctime_r(&finish);
157                 timebuf.erase(timebuf.find_first_of('\n'));
158                 fputs(", will finish ", out);
159                 fputs(timebuf.c_str(), out);
160             }
161             else
162             {
163                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
164             }
165         }
166         else
167         {
168             fprintf(out, " performance: %.1f ns/day    ",
169                     ir->delta_t/1000*24*60*60/time_per_step);
170         }
171     }
172 #if !GMX_THREAD_MPI
173     if (PAR(cr))
174     {
175         fprintf(out, "\n");
176     }
177 #else
178     GMX_UNUSED_VALUE(cr);
179 #endif
180
181     fflush(out);
182 }
183
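/* Write a message with the given title, the rank id and a formatted time stamp
 * to the log file.
 */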
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185                          double the_time)
186 {
187     if (!fplog)
188     {
189         return;
190     }
191
192     time_t temp_time = static_cast<time_t>(the_time);
193
194     auto   timebuf = gmx_ctime_r(&temp_time);
195
196     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
197 }
198
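/* Write "Started <name>" together with the run start time stamp to the log file. */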
199 void print_start(FILE *fplog, const t_commrec *cr,
200                  gmx_walltime_accounting_t walltime_accounting,
201                  const char *name)
202 {
203     char buf[STRLEN];
204
205     sprintf(buf, "Started %s", name);
206     print_date_and_time(fplog, cr->nodeid, buf,
207                         walltime_accounting_get_start_time_stamp(walltime_accounting));
208 }
209
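/*! \brief Add the forces in \p forceToAdd to \p f, parallelized over OpenMP threads. */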
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
211 {
212     const int      end = forceToAdd.size();
213
214     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
215 #pragma omp parallel for num_threads(nt) schedule(static)
216     for (int i = 0; i < end; i++)
217     {
218         rvec_inc(f[i], forceToAdd[i]);
219     }
220 }
221
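/*! \brief Calculate the local, short-range contribution to the virial from the
 * shift forces and the forces on the home atoms.
 */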
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223                         tensor vir_part, const t_graph *graph, const matrix box,
224                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
225 {
226     /* The short-range virial from surrounding boxes */
227     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
229
230     /* Calculate partial virial, for local atoms only, based on short range.
231      * Total virial is computed in global_stat, called from do_md
232      */
233     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
235
236     if (debug)
237     {
238         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
239     }
240 }
241
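/*! \brief Compute the center-of-mass pull potential and forces and accumulate the
 * energy and dV/dlambda contributions into \p enerd.
 */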
242 static void pull_potential_wrapper(const t_commrec *cr,
243                                    const t_inputrec *ir,
244                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245                                    gmx::ForceWithVirial *force,
246                                    const t_mdatoms *mdatoms,
247                                    gmx_enerdata_t *enerd,
248                                    const real *lambda,
249                                    double t,
250                                    gmx_wallcycle_t wcycle)
251 {
252     t_pbc  pbc;
253     real   dvdl;
254
255     /* Calculate the center of mass forces; this requires communication,
256      * which is why pull_potential is called close to other communication.
257      */
258     wallcycle_start(wcycle, ewcPULLPOT);
259     set_pbc(&pbc, ir->ePBC, box);
260     dvdl                     = 0;
261     enerd->term[F_COM_PULL] +=
262         pull_potential(ir->pull_work, mdatoms, &pbc,
263                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265     wallcycle_stop(wcycle, ewcPULLPOT);
266 }
267
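/*! \brief With separate PME ranks, receive the long-range forces, energies and
 * dV/dlambda from the PME ranks and accumulate them into the local buffers.
 */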
268 static void pme_receive_force_ener(const t_commrec      *cr,
269                                    gmx::ForceWithVirial *forceWithVirial,
270                                    gmx_enerdata_t       *enerd,
271                                    gmx_wallcycle_t       wcycle)
272 {
273     real   e_q, e_lj, dvdl_q, dvdl_lj;
274     float  cycles_ppdpme, cycles_seppme;
275
276     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
278
279     /* In case of node-splitting, the PP nodes receive the long-range
280      * forces, virial and energy from the PME nodes here.
281      */
282     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283     dvdl_q  = 0;
284     dvdl_lj = 0;
285     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286                       &cycles_seppme);
287     enerd->term[F_COUL_RECIP] += e_q;
288     enerd->term[F_LJ_RECIP]   += e_lj;
289     enerd->dvdl_lin[efptCOUL] += dvdl_q;
290     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
291
292     if (wcycle)
293     {
294         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
295     }
296     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
297 }
298
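/*! \brief Print all local atoms whose force exceeds \p forceTolerance or is not finite;
 * issues a fatal error when non-finite forces are detected.
 */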
299 static void print_large_forces(FILE            *fp,
300                                const t_mdatoms *md,
301                                const t_commrec *cr,
302                                int64_t          step,
303                                real             forceTolerance,
304                                const rvec      *x,
305                                const rvec      *f)
306 {
307     real           force2Tolerance = gmx::square(forceTolerance);
308     gmx::index     numNonFinite    = 0;
309     for (int i = 0; i < md->homenr; i++)
310     {
311         real force2    = norm2(f[i]);
312         bool nonFinite = !std::isfinite(force2);
313         if (force2 >= force2Tolerance || nonFinite)
314         {
315             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
316                     step,
317                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
318         }
319         if (nonFinite)
320         {
321             numNonFinite++;
322         }
323     }
324     if (numNonFinite > 0)
325     {
326         /* Note that with MPI this fatal call on one rank might interrupt
327          * the printing on other ranks. But we can only avoid that with
328          * an expensive MPI barrier that we would need at each step.
329          */
330         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
331     }
332 }
333
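/*! \brief Post-process the forces: when there are direct virial contributions (e.g. from
 * PME), spread the virtual-site forces in that separate buffer, add the buffer and its
 * virial to the main outputs when the virial is requested, and optionally report large forces.
 */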
334 static void post_process_forces(const t_commrec           *cr,
335                                 int64_t                    step,
336                                 t_nrnb                    *nrnb,
337                                 gmx_wallcycle_t            wcycle,
338                                 const gmx_localtop_t      *top,
339                                 const matrix               box,
340                                 const rvec                 x[],
341                                 rvec                       f[],
342                                 gmx::ForceWithVirial      *forceWithVirial,
343                                 tensor                     vir_force,
344                                 const t_mdatoms           *mdatoms,
345                                 const t_graph             *graph,
346                                 const t_forcerec          *fr,
347                                 const gmx_vsite_t         *vsite,
348                                 int                        flags)
349 {
350     if (fr->haveDirectVirialContributions)
351     {
352         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
353
354         if (vsite)
355         {
356             /* Spread the mesh force on virtual sites to the other particles...
357              * This is parallelized. MPI communication is performed
358              * if the constructing atoms aren't local.
359              */
360             matrix virial = { { 0 } };
361             spread_vsite_f(vsite, x, fDirectVir, nullptr,
362                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
363                            nrnb,
364                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365             forceWithVirial->addVirialContribution(virial);
366         }
367
368         if (flags & GMX_FORCE_VIRIAL)
369         {
370             /* Now add the forces, this is local */
371             sum_forces(f, forceWithVirial->force_);
372
373             /* Add the direct virial contributions */
374             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
376
377             if (debug)
378             {
379                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
380             }
381         }
382     }
383
384     if (fr->print_force >= 0)
385     {
386         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
387     }
388 }
389
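/*! \brief Dispatch the Verlet-scheme nonbonded kernel for the given interaction locality,
 * first pruning the dynamic pair list on CPU kernels when a pruning step is due.
 */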
390 static void do_nb_verlet(t_forcerec                       *fr,
391                          const interaction_const_t        *ic,
392                          gmx_enerdata_t                   *enerd,
393                          const int                         flags,
394                          const Nbnxm::InteractionLocality  ilocality,
395                          const int                         clearF,
396                          const int64_t                     step,
397                          t_nrnb                           *nrnb,
398                          gmx_wallcycle_t                   wcycle)
399 {
400     if (!(flags & GMX_FORCE_NONBONDED))
401     {
402         /* skip non-bonded calculation */
403         return;
404     }
405
406     nonbonded_verlet_t       *nbv  = fr->nbv;
407     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
408
409     /* GPU kernel launch overhead is already timed separately */
410     if (fr->cutoff_scheme != ecutsVERLET)
411     {
412         gmx_incons("Invalid cut-off scheme passed!");
413     }
414
415     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
416
417     if (!bUsingGpuKernels)
418     {
419         /* When dynamic pair-list pruning is requested, we need to prune
420          * every nstlistPrune steps.
421          */
422         if (nbv->listParams->useDynamicPruning &&
423             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
424         {
425             /* Prune the pair-list beyond fr->ic->rlistPrune using
426              * the current coordinates of the atoms.
427              */
428             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
429             NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
430             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
431         }
432
433         wallcycle_sub_start(wcycle, ewcsNONBONDED);
434     }
435
436     NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
437
438     if (!bUsingGpuKernels)
439     {
440         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
441     }
442 }
443
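/*! \brief Compute the perturbed (free-energy) nonbonded interactions with the free-energy
 * kernel, accumulating dV/dlambda and, with soft-core, also the foreign-lambda energies.
 */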
444 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
445                              t_forcerec           *fr,
446                              rvec                  x[],
447                              rvec                  f[],
448                              const t_mdatoms      *mdatoms,
449                              t_lambda             *fepvals,
450                              real                 *lambda,
451                              gmx_enerdata_t       *enerd,
452                              int                   flags,
453                              t_nrnb               *nrnb,
454                              gmx_wallcycle_t       wcycle)
455 {
456     int              donb_flags;
457     nb_kernel_data_t kernel_data;
458     real             lam_i[efptNR];
459     real             dvdl_nb[efptNR];
460     int              th;
461     int              i, j;
462
463     donb_flags = 0;
464     /* Add short-range interactions */
465     donb_flags |= GMX_NONBONDED_DO_SR;
466
467     /* Currently all group scheme kernels always calculate (shift-)forces */
468     if (flags & GMX_FORCE_FORCES)
469     {
470         donb_flags |= GMX_NONBONDED_DO_FORCE;
471     }
472     if (flags & GMX_FORCE_VIRIAL)
473     {
474         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
475     }
476     if (flags & GMX_FORCE_ENERGY)
477     {
478         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
479     }
480
481     kernel_data.flags  = donb_flags;
482     kernel_data.lambda = lambda;
483     kernel_data.dvdl   = dvdl_nb;
484
485     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
486     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
487
488     /* reset free energy components */
489     for (i = 0; i < efptNR; i++)
490     {
491         dvdl_nb[i]  = 0;
492     }
493
494     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
495
496     wallcycle_sub_start(wcycle, ewcsNONBONDED);
497 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
498     for (th = 0; th < nbl_lists->nnbl; th++)
499     {
500         try
501         {
502             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
503                                       x, f, fr, mdatoms, &kernel_data, nrnb);
504         }
505         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
506     }
507
508     if (fepvals->sc_alpha != 0)
509     {
510         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
511         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
512     }
513     else
514     {
515         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
516         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
517     }
518
519     /* If we do foreign lambda and we have soft-core interactions
520      * we have to recalculate the (non-linear) energy contributions.
521      */
522     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
523     {
524         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
525         kernel_data.lambda         = lam_i;
526         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
527         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
528         /* Note that we add to kernel_data.dvdl, but ignore the result */
529
530         for (i = 0; i < enerd->n_lambda; i++)
531         {
532             for (j = 0; j < efptNR; j++)
533             {
534                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
535             }
536             reset_foreign_enerdata(enerd);
537 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
538             for (th = 0; th < nbl_lists->nnbl; th++)
539             {
540                 try
541                 {
542                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
543                                               x, f, fr, mdatoms, &kernel_data, nrnb);
544                 }
545                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
546             }
547
548             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
549             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
550         }
551     }
552
553     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
554 }
555
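/* Returns whether the nonbonded interactions are computed on a GPU */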
556 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
557 {
558     return nbv != nullptr && nbv->bUseGPU;
559 }
560
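/*! \brief Zero the first \p n elements of \p v, using OpenMP when it is likely to pay off. */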
561 static inline void clear_rvecs_omp(int n, rvec v[])
562 {
563     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
564
565     /* Note that we would like to avoid this conditional by putting it
566      * into the omp pragma instead, but then we still take the full
567      * omp parallel for overhead (at least with gcc5).
568      */
569     if (nth == 1)
570     {
571         for (int i = 0; i < n; i++)
572         {
573             clear_rvec(v[i]);
574         }
575     }
576     else
577     {
578 #pragma omp parallel for num_threads(nth) schedule(static)
579         for (int i = 0; i < n; i++)
580         {
581             clear_rvec(v[i]);
582         }
583     }
584 }
585
586 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
587  *
588  * \param groupOptions  Group options, containing T-coupling options
589  */
590 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
591 {
592     real nrdfCoupled   = 0;
593     real nrdfUncoupled = 0;
594     real kineticEnergy = 0;
595     for (int g = 0; g < groupOptions.ngtc; g++)
596     {
597         if (groupOptions.tau_t[g] >= 0)
598         {
599             nrdfCoupled   += groupOptions.nrdf[g];
600             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
601         }
602         else
603         {
604             nrdfUncoupled += groupOptions.nrdf[g];
605         }
606     }
607
608     /* This conditional with > also catches nrdf=0 */
609     if (nrdfCoupled > nrdfUncoupled)
610     {
611         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
612     }
613     else
614     {
615         return 0;
616     }
617 }
618
619 /*! \brief This routine checks that the potential energy is finite.
620  *
621  * Always checks that the potential energy is finite. If step equals
622  * inputrec.init_step also checks that the magnitude of the potential energy
623  * is reasonable. Terminates with a fatal error when a check fails.
624  * Note that passing this check does not guarantee finite forces,
625  * since those use slightly different arithmetic. But in most cases
626  * there is just a narrow coordinate range where forces are not finite
627  * and energies are finite.
628  *
629  * \param[in] step      The step number, used for checking and printing
630  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
631  * \param[in] inputrec  The input record
632  */
633 static void checkPotentialEnergyValidity(int64_t               step,
634                                          const gmx_enerdata_t &enerd,
635                                          const t_inputrec     &inputrec)
636 {
637     /* Threshold valid for comparing absolute potential energy against
638      * the kinetic energy. Normally one should not consider absolute
639      * potential energy values, but with a factor of one million
640      * we should never get false positives.
641      */
642     constexpr real c_thresholdFactor = 1e6;
643
644     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
645     real           averageKineticEnergy = 0;
646     /* We only check for large potential energy at the initial step,
647      * because that is by far the most likely step for this to occur
648      * and because computing the average kinetic energy is not free.
649      * Note: nstcalcenergy >> 1 often does not allow catching large energies
650      * before they become NaN.
651      */
652     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
653     {
654         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
655     }
656
657     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
658                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
659     {
660         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
661                   step,
662                   enerd.term[F_EPOT],
663                   energyIsNotFinite ? "not finite" : "extremely high",
664                   enerd.term[F_LJ],
665                   enerd.term[F_COUL_SR],
666                   energyIsNotFinite ? "non-finite" : "very high",
667                   energyIsNotFinite ? " or NaN" : "");
668     }
669 }
670
671 /*! \brief Compute forces and/or energies for special algorithms
672  *
673  * The intention is to collect all calls to algorithms that compute
674  * forces on local atoms only and that do not contribute to the local
675  * virial sum (but add their virial contribution separately).
676  * Eventually these should likely all become ForceProviders.
677  * Within this function the intention is to have algorithms that do
678  * global communication at the end, so global barriers within the MD loop
679  * are as close together as possible.
680  *
681  * \param[in]     fplog            The log file
682  * \param[in]     cr               The communication record
683  * \param[in]     inputrec         The input record
684  * \param[in]     awh              The Awh module (nullptr if none in use).
685  * \param[in]     enforcedRotation Enforced rotation module.
686  * \param[in]     step             The current MD step
687  * \param[in]     t                The current time
688  * \param[in,out] wcycle           Wallcycle accounting struct
689  * \param[in,out] forceProviders   Pointer to a list of force providers
690  * \param[in]     box              The unit cell
691  * \param[in]     x                The coordinates
692  * \param[in]     mdatoms          Per atom properties
693  * \param[in]     lambda           Array of free-energy lambda values
694  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
695  * \param[in,out] forceWithVirial  Force and virial buffers
696  * \param[in,out] enerd            Energy buffer
697  * \param[in,out] ed               Essential dynamics pointer
698  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
699  *
700  * \todo Remove bNS, which is used incorrectly.
701  * \todo Convert all other algorithms called here to ForceProviders.
702  */
703 static void
704 computeSpecialForces(FILE                          *fplog,
705                      const t_commrec               *cr,
706                      const t_inputrec              *inputrec,
707                      gmx::Awh                      *awh,
708                      gmx_enfrot                    *enforcedRotation,
709                      int64_t                        step,
710                      double                         t,
711                      gmx_wallcycle_t                wcycle,
712                      ForceProviders                *forceProviders,
713                      matrix                         box,
714                      gmx::ArrayRef<const gmx::RVec> x,
715                      const t_mdatoms               *mdatoms,
716                      real                          *lambda,
717                      int                            forceFlags,
718                      gmx::ForceWithVirial          *forceWithVirial,
719                      gmx_enerdata_t                *enerd,
720                      gmx_edsam                     *ed,
721                      gmx_bool                       bNS)
722 {
723     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
724
725     /* NOTE: Currently all ForceProviders only provide forces.
726      *       When they also provide energies, remove this conditional.
727      */
728     if (computeForces)
729     {
730         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
731         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
732
733         /* Collect forces from modules */
734         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
735     }
736
737     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
738     {
739         pull_potential_wrapper(cr, inputrec, box, x,
740                                forceWithVirial,
741                                mdatoms, enerd, lambda, t,
742                                wcycle);
743
744         if (awh)
745         {
746             enerd->term[F_COM_PULL] +=
747                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
748                                                   forceWithVirial,
749                                                   t, step, wcycle, fplog);
750         }
751     }
752
753     rvec *f = as_rvec_array(forceWithVirial->force_.data());
754
755     /* Add the forces from enforced rotation potentials (if any) */
756     if (inputrec->bRot)
757     {
758         wallcycle_start(wcycle, ewcROTadd);
759         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
760         wallcycle_stop(wcycle, ewcROTadd);
761     }
762
763     if (ed)
764     {
765         /* Note that since init_edsam() is called after the initialization
766          * of forcerec, edsam doesn't request the noVirSum force buffer.
767          * Thus if no other algorithm (e.g. PME) requires it, the forces
768          * here will contribute to the virial.
769          */
770         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
771     }
772
773     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
774     if (inputrec->bIMD && computeForces)
775     {
776         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
777     }
778 }
779
780 /*! \brief Launch the prepare_step and spread stages of PME GPU.
781  *
782  * \param[in]  pmedata       The PME structure
783  * \param[in]  box           The box matrix
784  * \param[in]  x             Coordinate array
785  * \param[in]  flags         Force flags
786  * \param[in]  pmeFlags      PME flags
787  * \param[in]  wcycle        The wallcycle structure
788  */
789 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
790                                       matrix          box,
791                                       rvec            x[],
792                                       int             flags,
793                                       int             pmeFlags,
794                                       gmx_wallcycle_t wcycle)
795 {
796     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
797     pme_gpu_launch_spread(pmedata, x, wcycle);
798 }
799
800 /*! \brief Launch the FFT and gather stages of PME GPU
801  *
802  * This function only implements setting the output forces (no accumulation).
803  *
804  * \param[in]  pmedata        The PME structure
805  * \param[in]  wcycle         The wallcycle structure
806  */
807 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
808                                      gmx_wallcycle_t   wcycle)
809 {
810     pme_gpu_launch_complex_transforms(pmedata, wcycle);
811     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
812 }
813
814 /*! \brief
815  *  Polling wait for either of the PME or nonbonded GPU tasks.
816  *
817  * Instead of a static order in waiting for GPU tasks, this function
818  * polls to check which of the two tasks completes first, and performs the
819  * associated force-buffer reduction overlapped with the other task.
820  * Unlike a static scheduling order, this can always overlap
821  * one of the reductions, regardless of the GPU task completion order.
822  *
823  * \param[in]     nbv              Nonbonded verlet structure
824  * \param[in,out] pmedata          PME module data
825  * \param[in,out] force            Force array to reduce task outputs into.
826  * \param[in,out] forceWithVirial  Force and virial buffers
827  * \param[in,out] fshift           Shift force output vector results are reduced into
828  * \param[in,out] enerd            Energy data structure results are reduced into
829  * \param[in]     flags            Force flags
830  * \param[in]     pmeFlags         PME flags
831  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
832  * \param[in]     wcycle           The wallcycle structure
833  */
834 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
835                                         gmx_pme_t                           *pmedata,
836                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
837                                         gmx::ForceWithVirial                *forceWithVirial,
838                                         rvec                                 fshift[],
839                                         gmx_enerdata_t                      *enerd,
840                                         int                                  flags,
841                                         int                                  pmeFlags,
842                                         bool                                 haveOtherWork,
843                                         gmx_wallcycle_t                      wcycle)
844 {
845     bool isPmeGpuDone = false;
846     bool isNbGpuDone  = false;
847
848
849     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
850
851     while (!isPmeGpuDone || !isNbGpuDone)
852     {
853         if (!isPmeGpuDone)
854         {
855             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
856             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
857         }
858
859         if (!isNbGpuDone)
860         {
861             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
862             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
863             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
864                                                      flags,
865                                                      Nbnxm::AtomLocality::Local,
866                                                      haveOtherWork,
867                                                      enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
868                                                      fshift, completionType);
869             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
870             // To get the call count right, when the task finished we
871             // issue a start/stop.
872             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
873             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
874             if (isNbGpuDone)
875             {
876                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
877                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
878
879                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
880                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
881             }
882         }
883     }
884 }
885
886 /*! \brief
887  *  Launch the dynamic rolling pruning GPU task.
888  *
889  *  We currently alternate local/non-local list pruning in odd-even steps
890  *  (only pruning every second step without DD).
891  *
892  * \param[in]     cr               The communication record
893  * \param[in]     nbv              Nonbonded verlet structure
894  * \param[in]     inputrec         The input record
895  * \param[in]     step             The current MD step
896  */
897 static inline void launchGpuRollingPruning(const t_commrec          *cr,
898                                            const nonbonded_verlet_t *nbv,
899                                            const t_inputrec         *inputrec,
900                                            const int64_t             step)
901 {
902     /* We should not launch the rolling pruning kernel at a search
903      * step or just before search steps, since that's useless.
904      * Without domain decomposition we prune at even steps.
905      * With domain decomposition we alternate local and non-local
906      * pruning at even and odd steps.
907      */
908     int  numRollingParts     = nbv->listParams->numRollingParts;
909     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
910     int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
911     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
912     if (stepWithCurrentList > 0 &&
913         stepWithCurrentList < inputrec->nstlist - 1 &&
914         (stepIsEven || DOMAINDECOMP(cr)))
915     {
916         Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
917                                            stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
918                                            numRollingParts);
919     }
920 }
921
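/*! \brief Compute forces, and optionally energies and the virial, with the Verlet cut-off scheme.
 *
 * Performs pair search when requested, launches the GPU nonbonded, PME and bonded tasks,
 * handles communication with PME and other ranks, and computes the CPU part of the forces.
 */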
922 static void do_force_cutsVERLET(FILE *fplog,
923                                 const t_commrec *cr,
924                                 const gmx_multisim_t *ms,
925                                 const t_inputrec *inputrec,
926                                 gmx::Awh *awh,
927                                 gmx_enfrot *enforcedRotation,
928                                 int64_t step,
929                                 t_nrnb *nrnb,
930                                 gmx_wallcycle_t wcycle,
931                                 const gmx_localtop_t *top,
932                                 const gmx_groups_t * /* groups */,
933                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
934                                 history_t *hist,
935                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
936                                 tensor vir_force,
937                                 const t_mdatoms *mdatoms,
938                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
939                                 real *lambda,
940                                 t_graph *graph,
941                                 t_forcerec *fr,
942                                 gmx::PpForceWorkload *ppForceWorkload,
943                                 interaction_const_t *ic,
944                                 const gmx_vsite_t *vsite,
945                                 rvec mu_tot,
946                                 double t,
947                                 gmx_edsam *ed,
948                                 const int flags,
949                                 const DDBalanceRegionHandler &ddBalanceRegionHandler)
950 {
951     int                 cg1, i, j;
952     double              mu[2*DIM];
953     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
954     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
955     rvec                vzero, box_diag;
956     float               cycles_pme, cycles_wait_gpu;
957     nonbonded_verlet_t *nbv = fr->nbv;
958
959     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
960     bNS           = ((flags & GMX_FORCE_NS) != 0);
961     bFillGrid     = (bNS && bStateChanged);
962     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
963     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
964     bUseGPU       = fr->nbv->bUseGPU;
965     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
966
967     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
968     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
969     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
970         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
971     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
972         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
973         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
974         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
975
976     /* At a search step we need to start the first balancing region
977      * somewhere early inside the step after communication during domain
978      * decomposition (and not during the previous step as usual).
979      */
980     if (bNS)
981     {
982         ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
983     }
984
985     cycles_wait_gpu = 0;
986
987     const int start  = 0;
988     const int homenr = mdatoms->homenr;
989
990     clear_mat(vir_force);
991
992     if (DOMAINDECOMP(cr))
993     {
994         cg1 = cr->dd->globalAtomGroupIndices.size();
995     }
996     else
997     {
998         cg1 = top->cgs.nr;
999     }
1000     if (fr->n_tpi > 0)
1001     {
1002         cg1--;
1003     }
1004
1005     if (bStateChanged)
1006     {
1007         update_forcerec(fr, box);
1008
1009         if (inputrecNeedMutot(inputrec))
1010         {
1011             /* Calculate total (local) dipole moment in a temporary common array.
1012              * This makes it possible to sum them over nodes faster.
1013              */
1014             calc_mu(start, homenr,
1015                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1016                     mu, mu+DIM);
1017         }
1018     }
1019
1020     if (fr->ePBC != epbcNONE)
1021     {
1022         /* Compute shift vectors every step,
1023          * because of pressure coupling or box deformation!
1024          */
1025         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1026         {
1027             calc_shifts(box, fr->shift_vec);
1028         }
1029
1030         if (bCalcCGCM)
1031         {
1032             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1033             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1034         }
1035         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1036         {
1037             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1038         }
1039     }
1040
1041     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1042                                  fr->shift_vec, nbv->nbat);
1043
1044 #if GMX_MPI
1045     if (!thisRankHasDuty(cr, DUTY_PME))
1046     {
1047         /* Send particle coordinates to the PME nodes.
1048          * Since this is only implemented for domain decomposition
1049          * and domain decomposition does not use the graph,
1050          * we do not need to worry about shifting.
1051          */
1052         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1053                                  lambda[efptCOUL], lambda[efptVDW],
1054                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1055                                  step, wcycle);
1056     }
1057 #endif /* GMX_MPI */
1058
1059     if (useGpuPme)
1060     {
1061         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1062     }
1063
1064     /* do gridding for pair search */
1065     if (bNS)
1066     {
1067         if (graph && bStateChanged)
1068         {
1069             /* Calculate intramolecular shift vectors to make molecules whole */
1070             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1071         }
1072
1073         clear_rvec(vzero);
1074         box_diag[XX] = box[XX][XX];
1075         box_diag[YY] = box[YY][YY];
1076         box_diag[ZZ] = box[ZZ][ZZ];
1077
1078         wallcycle_start(wcycle, ewcNS);
1079         if (!DOMAINDECOMP(cr))
1080         {
1081             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1082             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1083                               0, vzero, box_diag,
1084                               nullptr, 0, mdatoms->homenr, -1,
1085                               fr->cginfo, x.unpaddedArrayRef(),
1086                               0, nullptr,
1087                               nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
1088                               nbv->nbat);
1089             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1090         }
1091         else
1092         {
1093             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1094             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1095                                        fr->cginfo, x.unpaddedArrayRef(),
1096                                        nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
1097                                        nbv->nbat);
1098             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1099         }
1100
1101         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1102
1103         wallcycle_stop(wcycle, ewcNS);
1104     }
1105
1106     /* initialize the GPU atom data and copy shift vector */
1107     if (bUseGPU)
1108     {
1109         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1110         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1111
1112         if (bNS)
1113         {
1114             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1115         }
1116
1117         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1118
1119         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1120
1121         if (bNS && fr->gpuBonded)
1122         {
1123             /* Now that all atoms are on the grid, we can assign bonded
1124              * interactions to the GPU, where the grid order is
1125              * needed. Also the xq, f and fshift device buffers have
1126              * been reallocated if needed, so the bonded code can
1127              * learn about them. */
1128             // TODO the xq, f, and fshift buffers are now shared
1129             // resources, so they should be maintained by a
1130             // higher-level object than the nb module.
1131             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1132                                                                   top->idef,
1133                                                                   Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1134                                                                   Nbnxm::gpu_get_f(nbv->gpu_nbv),
1135                                                                   Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1136             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1137         }
1138
1139         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1140     }
1141
1142     /* do local pair search */
1143     if (bNS)
1144     {
1145         nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
1146
1147         wallcycle_start_nocount(wcycle, ewcNS);
1148         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1149         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1150                             &top->excls,
1151                             nbv->listParams->rlistOuter,
1152                             nbv->min_ci_balanced,
1153                             &pairlistSet,
1154                             Nbnxm::InteractionLocality::Local,
1155                             nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
1156                             nrnb);
1157         pairlistSet.outerListCreationStep = step;
1158         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1159         {
1160             nbnxnPrepareListForDynamicPruning(&pairlistSet);
1161         }
1162         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1163
1164         if (bUseGPU)
1165         {
1166             /* initialize local pair-list on the GPU */
1167             Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
1168                                      pairlistSet.nblGpu[0],
1169                                      Nbnxm::InteractionLocality::Local);
1170         }
1171         wallcycle_stop(wcycle, ewcNS);
1172     }
1173     else
1174     {
1175         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1176                                         FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1177                                         nbv->nbat, wcycle);
1178     }
1179
1180     if (bUseGPU)
1181     {
1182         ddBalanceRegionHandler.openBeforeForceComputationGpu();
1183
1184         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1185
1186         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1187         Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1188         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1189
1190         // Bonded work is not split into separate local and non-local parts, so with DD
1191         // we can only launch the kernel after non-local coordinates have been received.
1192         if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1193         {
1194             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1195             fr->gpuBonded->launchKernels(fr, flags, box);
1196             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1197         }
1198
1199         /* launch local nonbonded work on GPU */
1200         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1201         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1202                      step, nrnb, wcycle);
1203         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1204         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1205     }
1206
1207     if (useGpuPme)
1208     {
1209         // In PME GPU and mixed mode we launch FFT / gather after the
1210         // X copy/transform to allow overlap as well as after the GPU NB
1211         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1212         // the nonbonded kernel.
1213         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1214     }
1215
1216     /* Communicate coordinates and sum dipole if necessary +
1217        do non-local pair search */
1218     if (DOMAINDECOMP(cr))
1219     {
1220         nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
1221
1222         if (bNS)
1223         {
1224             wallcycle_start_nocount(wcycle, ewcNS);
1225             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1226
1227             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1228                                 &top->excls,
1229                                 nbv->listParams->rlistOuter,
1230                                 nbv->min_ci_balanced,
1231                                 &pairlistSet,
1232                                 Nbnxm::InteractionLocality::NonLocal,
1233                                 nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
1234                                 nrnb);
1235             pairlistSet.outerListCreationStep = step;
1236             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1237             {
1238                 nbnxnPrepareListForDynamicPruning(&pairlistSet);
1239             }
1240             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1241
1242             if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
1243             {
1244                 /* initialize non-local pair-list on the GPU */
1245                 Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
1246                                          pairlistSet.nblGpu[0],
1247                                          Nbnxm::InteractionLocality::NonLocal);
1248             }
1249             wallcycle_stop(wcycle, ewcNS);
1250         }
1251         else
1252         {
1253             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1254
1255             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1256                                             FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1257                                             nbv->nbat, wcycle);
1258         }
1259
1260         if (bUseGPU)
1261         {
1262             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1263
1264             /* launch non-local nonbonded tasks on GPU */
1265             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1266             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1267             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1268
1269             if (ppForceWorkload->haveGpuBondedWork)
1270             {
1271                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1272                 fr->gpuBonded->launchKernels(fr, flags, box);
1273                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1274             }
1275
1276             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1277             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1278                          step, nrnb, wcycle);
1279             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1280
1281             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1282         }
1283     }
1284
1285     if (bUseGPU)
1286     {
1287         /* launch D2H copy-back F */
1288         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1289         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1290         if (DOMAINDECOMP(cr))
1291         {
1292             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1293                                       flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1294         }
1295         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1296                                   flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1297         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1298
1299         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1300         {
1301             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1302             fr->gpuBonded->launchEnergyTransfer();
1303             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1304         }
1305         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1306     }
1307
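         /* mu holds the local dipole of topology A in its first DIM entries
          * and that of topology B in the next DIM; it is summed over the
          * ranks here before being unpacked into fr->mu_tot below.
          */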
1308     if (bStateChanged && inputrecNeedMutot(inputrec))
1309     {
1310         if (PAR(cr))
1311         {
1312             gmx_sumd(2*DIM, mu, cr);
1313
1314             ddBalanceRegionHandler.reopenRegionCpu();
1315         }
1316
1317         for (i = 0; i < 2; i++)
1318         {
1319             for (j = 0; j < DIM; j++)
1320             {
1321                 fr->mu_tot[i][j] = mu[i*DIM + j];
1322             }
1323         }
1324     }
1325     if (fr->efep == efepNO)
1326     {
1327         copy_rvec(fr->mu_tot[0], mu_tot);
1328     }
1329     else
1330     {
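             /* With free-energy perturbation the total dipole is interpolated
              * linearly in lambda_coul between the A and B states.
              */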
1331         for (j = 0; j < DIM; j++)
1332         {
1333             mu_tot[j] =
1334                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1335                 lambda[efptCOUL]*fr->mu_tot[1][j];
1336         }
1337     }
1338
1339     /* Reset energies */
1340     reset_enerdata(enerd);
1341     clear_rvecs(SHIFTS, fr->fshift);
1342
1343     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1344     {
1345         wallcycle_start(wcycle, ewcPPDURINGPME);
1346         dd_force_flop_start(cr->dd, nrnb);
1347     }
1348
1349     if (inputrec->bRot)
1350     {
1351         wallcycle_start(wcycle, ewcROT);
1352         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1353         wallcycle_stop(wcycle, ewcROT);
1354     }
1355
1356     /* Temporary solution until all routines take PaddedRVecVector */
1357     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1358
1359     /* Start the force cycle counter.
1360      * Note that a different counter is used for dynamic load balancing.
1361      */
1362     wallcycle_start(wcycle, ewcFORCE);
1363
1364     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1365     if (bDoForces)
1366     {
1367         /* If we need to compute the virial, we might need a separate
1368          * force buffer for algorithms for which the virial is calculated
1369          * directly, such as PME.
1370          */
1371         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1372         {
1373             forceRef = *fr->forceBufferForDirectVirialContributions;
1374
1375             /* TODO: update comment
1376              * We only compute forces on local atoms. Note that vsites can
1377              * spread to non-local atoms, but that part of the buffer is
1378              * cleared separately in the vsite spreading code.
1379              */
1380             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1381         }
1382         /* Clear the short- and long-range forces */
1383         clear_rvecs_omp(fr->natoms_force_constr, f);
1384     }
1385
1386     /* forceWithVirial uses the local atom range only */
1387     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1388
1389     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1390     {
1391         clear_pull_forces(inputrec->pull_work);
1392     }
1393
1394     /* If the non-bonded forces are computed on the CPU, we calculate them here.
1395      * We do this before calling do_force_lowlevel, because in that
1396      * function, the listed forces are calculated before PME, which
1397      * does communication.  With this order, non-bonded and listed
1398      * force calculation imbalance can be balanced out by the domain
1399      * decomposition load balancing.
1400      */
1401
1402     if (!bUseOrEmulGPU)
1403     {
1404         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1405                      step, nrnb, wcycle);
1406     }
1407
1408     if (fr->efep != efepNO)
1409     {
1410         /* Calculate the local and non-local free energy interactions here.
1411          * Happens here on the CPU both with and without GPU.
1412          */
1413         if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
1414         {
1415             do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
1416                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1417                              inputrec->fepvals, lambda,
1418                              enerd, flags, nrnb, wcycle);
1419         }
1420
1421         if (DOMAINDECOMP(cr) &&
1422             fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1423         {
1424             do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
1425                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1426                              inputrec->fepvals, lambda,
1427                              enerd, flags, nrnb, wcycle);
1428         }
1429     }
1430
1431     if (!bUseOrEmulGPU)
1432     {
1433         if (DOMAINDECOMP(cr))
1434         {
1435             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1436                          step, nrnb, wcycle);
1437         }
1438
1439         const Nbnxm::InteractionLocality iloc =
1440             (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
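             /* Note: bUseOrEmulGPU is false in this branch, so iloc is always
              * InteractionLocality::Local here.
              */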
1441
1442         /* Add all the non-bonded forces to the normal force array.
1443          * This can be split into a local and a non-local part when overlapping
1444          * communication with calculation with domain decomposition.
1445          */
1446         wallcycle_stop(wcycle, ewcFORCE);
1447
1448         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
1449
1450         wallcycle_start_nocount(wcycle, ewcFORCE);
1451
1452         /* if there are multiple fshift output buffers reduce them */
1453         if ((flags & GMX_FORCE_VIRIAL) &&
1454             nbv->grp[iloc].nbl_lists.nnbl > 1)
1455         {
1456             /* This is not in a subcounter because it takes a
1457                negligible and constant-sized amount of time */
1458             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1459                                                      fr->fshift);
1460         }
1461     }
1462
1463     /* update QMMMrec, if necessary */
1464     if (fr->bQMMM)
1465     {
1466         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1467     }
1468
1469     /* Compute the bonded and non-bonded energies and optionally forces */
1470     do_force_lowlevel(fr, inputrec, &(top->idef),
1471                       cr, ms, nrnb, wcycle, mdatoms,
1472                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1473                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1474                       flags,
1475                       &cycles_pme, ddBalanceRegionHandler);
1476
1477     wallcycle_stop(wcycle, ewcFORCE);
1478
1479     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1480                          step, t, wcycle,
1481                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1482                          flags, &forceWithVirial, enerd,
1483                          ed, bNS);
1484
1485     if (bUseOrEmulGPU)
1486     {
1487         /* wait for non-local forces (or calculate in emulation mode) */
1488         if (DOMAINDECOMP(cr))
1489         {
1490             if (bUseGPU)
1491             {
1492                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1493                 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1494                                             flags, Nbnxm::AtomLocality::NonLocal,
1495                                             ppForceWorkload->haveGpuBondedWork,
1496                                             enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1497                                             fr->fshift);
1498                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1499             }
1500             else
1501             {
1502                 wallcycle_start_nocount(wcycle, ewcFORCE);
1503                 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1504                              step, nrnb, wcycle);
1505                 wallcycle_stop(wcycle, ewcFORCE);
1506             }
1507
1508             /* skip the reduction if there was no non-local work to do */
1509             if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
1510             {
1511                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1512                                                nbv->nbat, f, wcycle);
1513             }
1514         }
1515     }
1516
1517     if (DOMAINDECOMP(cr))
1518     {
1519         /* We are done with the CPU compute.
1520          * We will now communicate the non-local forces.
1521          * If we use a GPU this will overlap with GPU work, so in that case
1522          * we do not close the DD force balancing region here.
1523          */
1524         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1525
1526         if (bDoForces)
1527         {
1528             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1529         }
1530     }
1531
1532     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1533     // an alternating wait/reduction scheme.
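         // The alternating wait polls both tasks and reduces whichever output
         // becomes available first, which helps hide the latency of the slower task.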
1534     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1535     if (alternateGpuWait)
1536     {
1537         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1538     }
1539
1540     if (!alternateGpuWait && useGpuPme)
1541     {
1542         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1543     }
1544
1545     /* Wait for local GPU NB outputs on the non-alternating wait path */
1546     if (!alternateGpuWait && bUseGPU)
1547     {
1548         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1549          * is between 0.5 and 1.5 Mcycles, so 2 Mcycles is an overestimate,
1550          * but even with a step of 0.1 ms the difference is less than 1%
1551          * of the step time.
1552          */
1553         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1554
1555         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1556         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1557                                     flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1558                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1559                                     fr->fshift);
1560         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1561
1562         if (ddBalanceRegionHandler.useBalancingRegion())
1563         {
1564             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1565             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1566             {
1567                 /* We measured few cycles, it could be that the kernel
1568                  * and transfer finished earlier and there was no actual
1569                  * wait time, only API call overhead.
1570                  * Then the actual time could be anywhere between 0 and
1571                  * cycles_wait_est. We will use half of cycles_wait_est.
1572                  */
1573                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1574             }
1575             ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
1576         }
1577     }
1578
1579     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1580     {
1581         // NOTE: emulation kernel is not included in the balancing region,
1582         // but emulation mode does not target performance anyway
1583         wallcycle_start_nocount(wcycle, ewcFORCE);
1584         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1585                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1586                      step, nrnb, wcycle);
1587         wallcycle_stop(wcycle, ewcFORCE);
1588     }
1589
1590     if (useGpuPme)
1591     {
1592         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1593     }
1594
1595     if (bUseGPU)
1596     {
1597         /* now clear the GPU outputs while we finish the step on the CPU */
1598         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1599         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1600         Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1601
1602         /* Is dynamic pair-list pruning activated? */
1603         if (nbv->listParams->useDynamicPruning)
1604         {
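                 /* Rolling pruning re-prunes the pair list on the GPU in parts
                  * over consecutive steps, so that the list built with the
                  * outer cut-off stays usable until the next search step.
                  */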
1605             launchGpuRollingPruning(cr, nbv, inputrec, step);
1606         }
1607         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1608         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1609     }
1610
1611     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1612     {
1613         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1614         // in principle this should be included in the DD balancing region,
1615         // but generally it is infrequent so we'll omit it for the sake of
1616         // simpler code
1617         fr->gpuBonded->accumulateEnergyTerms(enerd);
1618         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1619
1620         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1621         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1622         fr->gpuBonded->clearEnergies();
1623         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1624         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1625     }
1626
1627     /* Do the nonbonded GPU (or emulation) force buffer reduction
1628      * on the non-alternating path. */
1629     if (bUseOrEmulGPU && !alternateGpuWait)
1630     {
1631         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1632                                        nbv->nbat, f, wcycle);
1633     }
1634     if (DOMAINDECOMP(cr))
1635     {
1636         dd_force_flop_stop(cr->dd, nrnb);
1637     }
1638
1639     if (bDoForces)
1640     {
1641         /* If we have forces with direct virial contributions but do not
1642          * calculate the virial, that buffer aliases f and is summed later.
1643          */
1644         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1645         {
1646             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1647                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1648         }
1649
1650         if (flags & GMX_FORCE_VIRIAL)
1651         {
1652             /* Calculation of the virial must be done after vsites! */
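                 /* (the spreading above moves force from the virtual sites
                  *  onto their constructing atoms, which changes f) */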
1653             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1654                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1655         }
1656     }
1657
1658     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1659     {
1660         /* In case of node-splitting, the PP nodes receive the long-range
1661          * forces, virial and energy from the PME nodes here.
1662          */
1663         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1664     }
1665
1666     if (bDoForces)
1667     {
1668         post_process_forces(cr, step, nrnb, wcycle,
1669                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1670                             vir_force, mdatoms, graph, fr, vsite,
1671                             flags);
1672     }
1673
1674     if (flags & GMX_FORCE_ENERGY)
1675     {
1676         /* Sum the potential energy terms from group contributions */
1677         sum_epot(&(enerd->grpp), enerd->term);
1678
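             /* With test-particle insertion very high energies from overlapping
              * insertions are expected, so the sanity check is skipped there.
              */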
1679         if (!EI_TPI(inputrec->eI))
1680         {
1681             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1682         }
1683     }
1684 }
1685
1686 static void do_force_cutsGROUP(FILE *fplog,
1687                                const t_commrec *cr,
1688                                const gmx_multisim_t *ms,
1689                                const t_inputrec *inputrec,
1690                                gmx::Awh *awh,
1691                                gmx_enfrot *enforcedRotation,
1692                                int64_t step,
1693                                t_nrnb *nrnb,
1694                                gmx_wallcycle_t wcycle,
1695                                gmx_localtop_t *top,
1696                                const gmx_groups_t *groups,
1697                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1698                                history_t *hist,
1699                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1700                                tensor vir_force,
1701                                const t_mdatoms *mdatoms,
1702                                gmx_enerdata_t *enerd,
1703                                t_fcdata *fcd,
1704                                real *lambda,
1705                                t_graph *graph,
1706                                t_forcerec *fr,
1707                                const gmx_vsite_t *vsite,
1708                                rvec mu_tot,
1709                                double t,
1710                                gmx_edsam *ed,
1711                                int flags,
1712                                const DDBalanceRegionHandler &ddBalanceRegionHandler)
1713 {
1714     int        cg0, cg1, i, j;
1715     double     mu[2*DIM];
1716     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1717     gmx_bool   bDoForces;
1718     float      cycles_pme;
1719
1720     const int  start  = 0;
1721     const int  homenr = mdatoms->homenr;
1722
1723     clear_mat(vir_force);
1724
1725     cg0 = 0;
1726     if (DOMAINDECOMP(cr))
1727     {
1728         cg1 = cr->dd->globalAtomGroupIndices.size();
1729     }
1730     else
1731     {
1732         cg1 = top->cgs.nr;
1733     }
1734     if (fr->n_tpi > 0)
1735     {
1736         cg1--;
1737     }
1738
1739     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1740     bNS            = ((flags & GMX_FORCE_NS) != 0);
1741     /* We only need to fill the charge-group grid when searching with a changed state */
1742     bFillGrid      = (bNS && bStateChanged);
1743     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1744     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1745
1746     if (bStateChanged)
1747     {
1748         update_forcerec(fr, box);
1749
1750         if (inputrecNeedMutot(inputrec))
1751         {
1752             /* Calculate total (local) dipole moment in a temporary common array.
1753              * This makes it possible to sum them over nodes faster.
1754              */
1755             calc_mu(start, homenr,
1756                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1757                     mu, mu+DIM);
1758         }
1759     }
1760
1761     if (fr->ePBC != epbcNONE)
1762     {
1763         /* Compute shift vectors every step,
1764          * because of pressure coupling or box deformation!
1765          */
1766         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1767         {
1768             calc_shifts(box, fr->shift_vec);
1769         }
1770
1771         if (bCalcCGCM)
1772         {
1773             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1774                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1775             inc_nrnb(nrnb, eNR_CGCM, homenr);
1776             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1777         }
1778         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1779         {
1780             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1781         }
1782     }
1783     else if (bCalcCGCM)
1784     {
1785         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1786         inc_nrnb(nrnb, eNR_CGCM, homenr);
1787     }
1788
1789     if (bCalcCGCM && gmx_debug_at)
1790     {
1791         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1792     }
1793
1794 #if GMX_MPI
1795     if (!thisRankHasDuty(cr, DUTY_PME))
1796     {
1797         /* Send particle coordinates to the pme nodes.
1798          * Since this is only implemented for domain decomposition
1799          * and domain decomposition does not use the graph,
1800          * we do not need to worry about shifting.
1801          */
1802         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1803                                  lambda[efptCOUL], lambda[efptVDW],
1804                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1805                                  step, wcycle);
1806     }
1807 #endif /* GMX_MPI */
1808
1809     /* Communicate coordinates and sum the dipole if necessary */
1810     if (DOMAINDECOMP(cr))
1811     {
1812         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1813
1814         /* No GPU support, no move_x overlap, so reopen the balance region here */
1815         ddBalanceRegionHandler.reopenRegionCpu();
1816     }
1817
1818     if (inputrecNeedMutot(inputrec))
1819     {
1820         if (bStateChanged)
1821         {
1822             if (PAR(cr))
1823             {
1824                 gmx_sumd(2*DIM, mu, cr);
1825
1826                 ddBalanceRegionHandler.reopenRegionCpu();
1827             }
1828             for (i = 0; i < 2; i++)
1829             {
1830                 for (j = 0; j < DIM; j++)
1831                 {
1832                     fr->mu_tot[i][j] = mu[i*DIM + j];
1833                 }
1834             }
1835         }
1836         if (fr->efep == efepNO)
1837         {
1838             copy_rvec(fr->mu_tot[0], mu_tot);
1839         }
1840         else
1841         {
1842             for (j = 0; j < DIM; j++)
1843             {
1844                 mu_tot[j] =
1845                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1846             }
1847         }
1848     }
1849
1850     /* Reset energies */
1851     reset_enerdata(enerd);
1852     clear_rvecs(SHIFTS, fr->fshift);
1853
1854     if (bNS)
1855     {
1856         wallcycle_start(wcycle, ewcNS);
1857
1858         if (graph && bStateChanged)
1859         {
1860             /* Calculate intramolecular shift vectors to make molecules whole */
1861             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1862         }
1863
1864         /* Do the actual neighbour searching */
1865         ns(fplog, fr, box,
1866            groups, top, mdatoms,
1867            cr, nrnb, bFillGrid);
1868
1869         wallcycle_stop(wcycle, ewcNS);
1870     }
1871
1872     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1873     {
1874         wallcycle_start(wcycle, ewcPPDURINGPME);
1875         dd_force_flop_start(cr->dd, nrnb);
1876     }
1877
1878     if (inputrec->bRot)
1879     {
1880         wallcycle_start(wcycle, ewcROT);
1881         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1882         wallcycle_stop(wcycle, ewcROT);
1883     }
1884
1885     /* Temporary solution until all routines take PaddedRVecVector */
1886     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1887
1888     /* Start the force cycle counter.
1889      * Note that a different counter is used for dynamic load balancing.
1890      */
1891     wallcycle_start(wcycle, ewcFORCE);
1892
1893     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1894     if (bDoForces)
1895     {
1896         /* If we need to compute the virial, we might need a separate
1897          * force buffer for algorithms for which the virial is calculated
1898          * separately, such as PME.
1899          */
1900         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1901         {
1902             forceRef = *fr->forceBufferForDirectVirialContributions;
1903
1904             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1905         }
1906
1907         /* Clear the short- and long-range forces */
1908         clear_rvecs(fr->natoms_force_constr, f);
1909     }
1910
1911     /* forceWithVirial might need the full force atom range */
1912     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1913
1914     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1915     {
1916         clear_pull_forces(inputrec->pull_work);
1917     }
1918
1919     /* update QMMMrec, if necessary */
1920     if (fr->bQMMM)
1921     {
1922         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1923     }
1924
1925     /* Compute the bonded and non-bonded energies and optionally forces */
1926     do_force_lowlevel(fr, inputrec, &(top->idef),
1927                       cr, ms, nrnb, wcycle, mdatoms,
1928                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1929                       box, inputrec->fepvals, lambda,
1930                       graph, &(top->excls), fr->mu_tot,
1931                       flags,
1932                       &cycles_pme, ddBalanceRegionHandler);
1933
1934     wallcycle_stop(wcycle, ewcFORCE);
1935
1936     if (DOMAINDECOMP(cr))
1937     {
1938         dd_force_flop_stop(cr->dd, nrnb);
1939
1940         ddBalanceRegionHandler.closeAfterForceComputationCpu();
1941     }
1942
1943     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1944                          step, t, wcycle,
1945                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1946                          flags, &forceWithVirial, enerd,
1947                          ed, bNS);
1948
1949     if (bDoForces)
1950     {
1951         /* Communicate the forces */
1952         if (DOMAINDECOMP(cr))
1953         {
1954             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1955             /* Do we need to communicate the separate force array
1956              * for terms that do not contribute to the single sum virial?
1957              * Position restraints and electric fields do not introduce
1958              * inter-cg forces, only full electrostatics methods do.
1959              * When we do not calculate the virial, the separate buffer aliases f,
1960              * so we have already communicated these forces.
1961              */
1962             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1963                 (flags & GMX_FORCE_VIRIAL))
1964             {
1965                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1966             }
1967         }
1968
1969         /* If we have forces with direct virial contributions but do not
1970          * calculate the virial, that buffer aliases f and is summed later.
1971          */
1972         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1973         {
1974             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1975                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1976         }
1977
1978         if (flags & GMX_FORCE_VIRIAL)
1979         {
1980             /* Calculation of the virial must be done after vsites! */
1981             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1982                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1983         }
1984     }
1985
1986     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1987     {
1988         /* In case of node-splitting, the PP nodes receive the long-range
1989          * forces, virial and energy from the PME nodes here.
1990          */
1991         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1992     }
1993
1994     if (bDoForces)
1995     {
1996         post_process_forces(cr, step, nrnb, wcycle,
1997                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1998                             vir_force, mdatoms, graph, fr, vsite,
1999                             flags);
2000     }
2001
2002     if (flags & GMX_FORCE_ENERGY)
2003     {
2004         /* Sum the potential energy terms from group contributions */
2005         sum_epot(&(enerd->grpp), enerd->term);
2006
2007         if (!EI_TPI(inputrec->eI))
2008         {
2009             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2010         }
2011     }
2012
2013 }
2014
2015 void do_force(FILE                                     *fplog,
2016               const t_commrec                          *cr,
2017               const gmx_multisim_t                     *ms,
2018               const t_inputrec                         *inputrec,
2019               gmx::Awh                                 *awh,
2020               gmx_enfrot                               *enforcedRotation,
2021               int64_t                                   step,
2022               t_nrnb                                   *nrnb,
2023               gmx_wallcycle_t                           wcycle,
2024               gmx_localtop_t                           *top,
2025               const gmx_groups_t                       *groups,
2026               matrix                                    box,
2027               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
2028               history_t                                *hist,
2029               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
2030               tensor                                    vir_force,
2031               const t_mdatoms                          *mdatoms,
2032               gmx_enerdata_t                           *enerd,
2033               t_fcdata                                 *fcd,
2034               gmx::ArrayRef<real>                       lambda,
2035               t_graph                                  *graph,
2036               t_forcerec                               *fr,
2037               gmx::PpForceWorkload                     *ppForceWorkload,
2038               const gmx_vsite_t                        *vsite,
2039               rvec                                      mu_tot,
2040               double                                    t,
2041               gmx_edsam                                *ed,
2042               int                                       flags,
2043               const DDBalanceRegionHandler             &ddBalanceRegionHandler)
2044 {
2045     /* modify force flag if not doing nonbonded */
2046     if (!fr->bNonbonded)
2047     {
2048         flags &= ~GMX_FORCE_NONBONDED;
2049     }
2050
2051     switch (inputrec->cutoff_scheme)
2052     {
2053         case ecutsVERLET:
2054             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2055                                 awh, enforcedRotation, step, nrnb, wcycle,
2056                                 top,
2057                                 groups,
2058                                 box, x, hist,
2059                                 force, vir_force,
2060                                 mdatoms,
2061                                 enerd, fcd,
2062                                 lambda.data(), graph,
2063                                 fr,
2064                                 ppForceWorkload,
2065                                 fr->ic,
2066                                 vsite, mu_tot,
2067                                 t, ed,
2068                                 flags,
2069                                 ddBalanceRegionHandler);
2070             break;
2071         case ecutsGROUP:
2072             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2073                                awh, enforcedRotation, step, nrnb, wcycle,
2074                                top,
2075                                groups,
2076                                box, x, hist,
2077                                force, vir_force,
2078                                mdatoms,
2079                                enerd, fcd,
2080                                lambda.data(), graph,
2081                                fr, vsite, mu_tot,
2082                                t, ed,
2083                                flags,
2084                                ddBalanceRegionHandler);
2085             break;
2086         default:
2087             gmx_incons("Invalid cut-off scheme passed!");
2088     }
2089
2090     /* In case we don't have constraints and are using GPUs, the next balancing
2091      * region starts here.
2092      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2093      * virial calculation and COM pulling, is thus not included in
2094      * the balance timing, which is OK as those tasks mostly do communication.
2095      */
2096     ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
2097 }
2098
2099
2100 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2101                         const t_inputrec *ir, const t_mdatoms *md,
2102                         t_state *state)
2103 {
2104     int             i, m, start, end;
2105     int64_t         step;
2106     real            dt = ir->delta_t;
2107     real            dvdl_dum;
2108     rvec           *savex;
2109
2110     /* We need to allocate one element extra, since we might use
2111      * (unaligned) 4-wide SIMD loads to access rvec entries.
2112      */
2113     snew(savex, state->natoms + 1);
2114
2115     start = 0;
2116     end   = md->homenr;
2117
2118     if (debug)
2119     {
2120         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2121                 start, md->homenr, end);
2122     }
2123     /* Do a first constraint pass to reset the particles... */
2124     step = ir->init_step;
2125     if (fplog)
2126     {
2127         char buf[STEPSTRSIZE];
2128         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2129                 gmx_step_str(step, buf));
2130     }
2131     dvdl_dum = 0;
2132
2133     /* constrain the current position */
2134     constr->apply(TRUE, FALSE,
2135                   step, 0, 1.0,
2136                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
2137                   state->box,
2138                   state->lambda[efptBONDED], &dvdl_dum,
2139                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2140     if (EI_VV(ir->eI))
2141     {
2142         /* constrain the initial velocity, and save it */
2143         /* this may also be useful if we need the ekin from the half step for velocity Verlet */
2144         constr->apply(TRUE, FALSE,
2145                       step, 0, 1.0,
2146                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2147                       state->box,
2148                       state->lambda[efptBONDED], &dvdl_dum,
2149                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2150     }
2151     /* constrain the initial velocities at t-dt/2 */
2152     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2153     {
2154         auto x = makeArrayRef(state->x).subArray(start, end);
2155         auto v = makeArrayRef(state->v).subArray(start, end);
2156         for (i = start; (i < end); i++)
2157         {
2158             for (m = 0; (m < DIM); m++)
2159             {
2160                 /* Reverse the velocity */
2161                 v[i][m] = -v[i][m];
2162                 /* Store the position at t-dt in buf */
2163                 savex[i][m] = x[i][m] + dt*v[i][m];
2164             }
2165         }
2166         /* Shake the positions at t=-dt with the positions at t=0
2167          * as reference coordinates.
2168          */
2169         if (fplog)
2170         {
2171             char buf[STEPSTRSIZE];
2172             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2173                     gmx_step_str(step, buf));
2174         }
2175         dvdl_dum = 0;
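             /* state->v is passed to apply() as well, so the velocities also
              * receive the correction corresponding to this position constraint.
              */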
2176         constr->apply(TRUE, FALSE,
2177                       step, -1, 1.0,
2178                       state->x.rvec_array(), savex, nullptr,
2179                       state->box,
2180                       state->lambda[efptBONDED], &dvdl_dum,
2181                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2182
2183         for (i = start; i < end; i++)
2184         {
2185             for (m = 0; m < DIM; m++)
2186             {
2187                 /* Re-reverse the velocities */
2188                 v[i][m] = -v[i][m];
2189             }
2190         }
2191     }
2192     sfree(savex);
2193 }
2194
2195
2196 static void
2197 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2198                 double *enerout, double *virout)
2199 {
2200     double enersum, virsum;
2201     double invscale, invscale2, invscale3;
2202     double r, ea, eb, ec, pa, pb, pc, pd;
2203     double y0, f, g, h;
2204     int    ri, offset;
2205     double tabfactor;
2206
2207     invscale  = 1.0/scale;
2208     invscale2 = invscale*invscale;
2209     invscale3 = invscale*invscale2;
2210
2211     /* Following summation derived from cubic spline definition,
2212      * Numerical Recipes in C, second edition, pp. 113-116.  Exact for
2213      * the cubic spline.  We first calculate the negative of the
2214      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2215      * add the more standard, abrupt cutoff correction to that result,
2216      * yielding the long-range correction for a switched function.  We
2217      * perform both the pressure and energy loops at the same time for
2218      * simplicity, as the computational cost is low. */
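         /* Within each table bin the potential is the cubic polynomial
          *   V(eps) = y0 + f*eps + g*eps^2 + h*eps^3,  eps in [0,1],
          * with offstart selecting the dispersion or repulsion part of the
          * 8-real-per-point layout.  The ea/eb/ec and pa/pb/pc/pd factors
          * below result from integrating r^2*V(r) (energy) and r^3*dV/dr
          * (virial) analytically over each bin.
          */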
2219
2220     if (offstart == 0)
2221     {
2222         /* Since the dispersion table has been scaled down by a factor of
2223          * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2224          * c6/c12 parameters inside nbfp[] being scaled up (to save
2225          * flops in kernels), we need to correct for this.
2226          */
2227         tabfactor = 6.0;
2228     }
2229     else
2230     {
2231         tabfactor = 12.0;
2232     }
2233
2234     enersum = 0.0;
2235     virsum  = 0.0;
2236     for (ri = rstart; ri < rend; ++ri)
2237     {
2238         r  = ri*invscale;
2239         ea = invscale3;
2240         eb = 2.0*invscale2*r;
2241         ec = invscale*r*r;
2242
2243         pa = invscale3;
2244         pb = 3.0*invscale2*r;
2245         pc = 3.0*invscale*r*r;
2246         pd = r*r*r;
2247
2248         /* this "8" is from the packing in the vdwtab array - perhaps
2249            should be defined? */
2250
2251         offset = 8*ri + offstart;
2252         y0     = vdwtab[offset];
2253         f      = vdwtab[offset+1];
2254         g      = vdwtab[offset+2];
2255         h      = vdwtab[offset+3];
2256
2257         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2258         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2259     }
2260     *enerout = 4.0*M_PI*enersum*tabfactor;
2261     *virout  = 4.0*M_PI*virsum*tabfactor;
2262 }
2263
2264 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2265 {
2266     double   eners[2], virs[2], enersum, virsum;
2267     double   r0, rc3, rc9;
2268     int      ri0, ri1, i;
2269     real     scale, *vdwtab;
2270
2271     fr->enershiftsix    = 0;
2272     fr->enershifttwelve = 0;
2273     fr->enerdiffsix     = 0;
2274     fr->enerdifftwelve  = 0;
2275     fr->virdiffsix      = 0;
2276     fr->virdifftwelve   = 0;
2277
2278     const interaction_const_t *ic = fr->ic;
2279
2280     if (eDispCorr != edispcNO)
2281     {
2282         for (i = 0; i < 2; i++)
2283         {
2284             eners[i] = 0;
2285             virs[i]  = 0;
2286         }
2287         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2288             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2289             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2290             (ic->vdwtype == evdwSHIFT) ||
2291             (ic->vdwtype == evdwSWITCH))
2292         {
2293             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2294                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2295                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2296             {
2297                 gmx_fatal(FARGS,
2298                           "With dispersion correction rvdw-switch can not be zero "
2299                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2300             }
2301
2302             /* TODO This code depends on the logic in tables.c that
2303                constructs the table layout, which should be made
2304                explicit in future cleanup. */
2305             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2306                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2307             scale  = fr->dispersionCorrectionTable->scale;
2308             vdwtab = fr->dispersionCorrectionTable->data;
2309
2310             /* Round the cut-offs to exact table values for precision */
2311             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2312             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2313
2314             /* The code below has some support for handling force-switching, i.e.
2315              * when the force (instead of potential) is switched over a limited
2316              * region. This leads to a constant shift in the potential inside the
2317              * switching region, which we can handle by adding a constant energy
2318              * term in the force-switch case just like when we do potential-shift.
2319              *
2320              * For now this is not enabled, but to keep the functionality in the
2321              * code we check separately for switch and shift. When we do force-switch
2322              * the shifting point is rvdw_switch, while it is the cutoff when we
2323              * have a classical potential-shift.
2324              *
2325              * For a pure potential-shift the potential has a constant shift
2326              * all the way out to the cutoff, and that is it. For other forms
2327              * we need to calculate the constant shift up to the point where we
2328              * start modifying the potential.
2329              */
2330             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2331
2332             r0   = ri0/scale;
2333             rc3  = r0*r0*r0;
2334             rc9  = rc3*rc3*rc3;
2335
2336             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2337                 (ic->vdwtype == evdwSHIFT))
2338             {
2339                 /* Determine the constant energy shift below rvdw_switch.
2340                  * Table has a scale factor since we have scaled it down to compensate
2341                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2342                  */
2343                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2344                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2345             }
2346             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2347             {
2348                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2349                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2350             }
2351
2352             /* Add the constant part from 0 to rvdw_switch.
2353              * This integration from 0 to rvdw_switch overcounts the number
2354              * of interactions by 1, as it also counts the self interaction.
2355              * We will correct for this later.
2356              */
2357             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2358             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2359
2360             /* Calculate the contribution in the range [r0,r1] where we
2361              * modify the potential. For a pure potential-shift modifier we will
2362              * have ri0==ri1, and there will not be any contribution here.
2363              */
2364             for (i = 0; i < 2; i++)
2365             {
2366                 enersum = 0;
2367                 virsum  = 0;
2368                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2369                 eners[i] -= enersum;
2370                 virs[i]  -= virsum;
2371             }
2372
2373             /* Alright: Above we compensated by REMOVING the parts outside r0
2374              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2375              *
2376              * Regardless of whether r0 is the point where we start switching,
2377              * or the cutoff where we calculated the constant shift, we include
2378              * all the parts we are missing out to infinity from r0 by
2379              * calculating the analytical dispersion correction.
2380              */
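                 /* For g(r) = 1 the tail integrals are:
                  *   int_r0^inf 4 pi r^2 (-1/r^6 ) dr = -4 pi/(3 r0^3)
                  *   int_r0^inf 4 pi r^2 ( 1/r^12) dr =  4 pi/(9 r0^9)
                  * with the corresponding r^3*dV/dr integrals for the virial.
                  */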
2381             eners[0] += -4.0*M_PI/(3.0*rc3);
2382             eners[1] +=  4.0*M_PI/(9.0*rc9);
2383             virs[0]  +=  8.0*M_PI/rc3;
2384             virs[1]  += -16.0*M_PI/(3.0*rc9);
2385         }
2386         else if (ic->vdwtype == evdwCUT ||
2387                  EVDW_PME(ic->vdwtype) ||
2388                  ic->vdwtype == evdwUSER)
2389         {
2390             if (ic->vdwtype == evdwUSER && fplog)
2391             {
2392                 fprintf(fplog,
2393                         "WARNING: using dispersion correction with user tables\n");
2394             }
2395
2396             /* Note that with LJ-PME, the dispersion correction is multiplied
2397              * by the difference between the actual C6 and the value of C6
2398              * that would produce the combination rule.
2399              * This means the normal energy and virial difference formulas
2400              * can be used here.
2401              */
2402
2403             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2404             rc9  = rc3*rc3*rc3;
2405             /* Contribution beyond the cut-off */
2406             eners[0] += -4.0*M_PI/(3.0*rc3);
2407             eners[1] +=  4.0*M_PI/(9.0*rc9);
2408             if (ic->vdw_modifier == eintmodPOTSHIFT)
2409             {
2410                 /* Contribution within the cut-off */
2411                 eners[0] += -4.0*M_PI/(3.0*rc3);
2412                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2413             }
2414             /* Contribution beyond the cut-off */
2415             virs[0]  +=  8.0*M_PI/rc3;
2416             virs[1]  += -16.0*M_PI/(3.0*rc9);
2417         }
2418         else
2419         {
2420             gmx_fatal(FARGS,
2421                       "Dispersion correction is not implemented for vdw-type = %s",
2422                       evdw_names[ic->vdwtype]);
2423         }
2424
2425         /* When we deprecate the group kernels the code below can go too */
2426         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2427         {
2428             /* Calculate self-interaction coefficient (assuming that
2429              * the reciprocal-space contribution is constant in the
2430              * region that contributes to the self-interaction).
2431              */
2432             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2433
2434             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2435             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2436         }
2437
2438         fr->enerdiffsix    = eners[0];
2439         fr->enerdifftwelve = eners[1];
2440         /* The 0.5 is due to the Gromacs definition of the virial */
2441         fr->virdiffsix     = 0.5*virs[0];
2442         fr->virdifftwelve  = 0.5*virs[1];
2443     }
2444 }
2445
2446 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2447                    const matrix box, real lambda, tensor pres, tensor virial,
2448                    real *prescorr, real *enercorr, real *dvdlcorr)
2449 {
2450     gmx_bool bCorrAll, bCorrPres;
2451     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2452     int      m;
2453
2454     *prescorr = 0;
2455     *enercorr = 0;
2456     *dvdlcorr = 0;
2457
2458     clear_mat(virial);
2459     clear_mat(pres);
2460
2461     if (ir->eDispCorr != edispcNO)
2462     {
2463         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2464                      ir->eDispCorr == edispcAllEnerPres);
2465         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2466                      ir->eDispCorr == edispcAllEnerPres);
2467
2468         invvol = 1/det(box);
2469         if (fr->n_tpi)
2470         {
2471             /* Only correct for the interactions with the inserted molecule */
2472             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2473             ninter = fr->n_tpi;
2474         }
2475         else
2476         {
2477             dens   = fr->numAtomsForDispersionCorrection*invvol;
2478             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2479         }
2480
2481         if (ir->efep == efepNO)
2482         {
2483             avcsix    = fr->avcsix[0];
2484             avctwelve = fr->avctwelve[0];
2485         }
2486         else
2487         {
2488             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2489             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2490         }
2491
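             /* ninter*dens equals N^2/(2V) (or n_tpi*(N - n_tpi)/V with TPI);
              * for a plain cut-off this turns the per-pair tail integral
              * enerdiffsix into the usual -(2 pi/3)*N^2/V*<C6>/rc^3 correction.
              */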
2492         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2493         *enercorr += avcsix*enerdiff;
2494         dvdlambda  = 0.0;
2495         if (ir->efep != efepNO)
2496         {
2497             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2498         }
2499         if (bCorrAll)
2500         {
2501             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2502             *enercorr += avctwelve*enerdiff;
2503             if (fr->efep != efepNO)
2504             {
2505                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2506             }
2507         }
2508
2509         if (bCorrPres)
2510         {
2511             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2512             if (ir->eDispCorr == edispcAllEnerPres)
2513             {
2514                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2515             }
2516             /* The factor 2 is because of the Gromacs virial definition */
2517             spres = -2.0*invvol*svir*PRESFAC;
2518
2519             for (m = 0; m < DIM; m++)
2520             {
2521                 virial[m][m] += svir;
2522                 pres[m][m]   += spres;
2523             }
2524             *prescorr += spres;
2525         }
2526
2527         /* Can't currently control when this prints; for now, just print when debugging */
2528         if (debug)
2529         {
2530             if (bCorrAll)
2531             {
2532                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2533                         avcsix, avctwelve);
2534             }
2535             if (bCorrPres)
2536             {
2537                 fprintf(debug,
2538                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2539                         *enercorr, spres, svir);
2540             }
2541             else
2542             {
2543                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2544             }
2545         }
2546
2547         if (fr->efep != efepNO)
2548         {
2549             *dvdlcorr += dvdlambda;
2550         }
2551     }
2552 }
2553
2554 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2555                             const gmx_mtop_t *mtop, rvec x[],
2556                             gmx_bool bFirst)
2557 {
2558     t_graph        *graph;
2559     int             as, mol;
2560
2561     if (bFirst && fplog)
2562     {
2563         fprintf(fplog, "Removing pbc first time\n");
2564     }
2565
2566     snew(graph, 1);
2567     as = 0;
2568     for (const gmx_molblock_t &molb : mtop->molblock)
2569     {
2570         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2571         if (moltype.atoms.nr == 1 ||
2572             (!bFirst && moltype.cgs.nr == 1))
2573         {
2574             /* Just one atom or charge group in the molecule, no PBC required */
2575             as += molb.nmol*moltype.atoms.nr;
2576         }
2577         else
2578         {
2579             mk_graph_moltype(moltype, graph);
2580
2581             for (mol = 0; mol < molb.nmol; mol++)
2582             {
2583                 mk_mshift(fplog, graph, ePBC, box, x+as);
2584
2585                 shift_self(graph, box, x+as);
2586                 /* The molecule is whole now.
2587                  * We don't need the second mk_mshift call as in do_pbc_first,
2588                  * since we no longer need this graph.
2589                  */
2590
2591                 as += moltype.atoms.nr;
2592             }
2593             done_graph(graph);
2594         }
2595     }
2596     sfree(graph);
2597 }
2598
2599 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2600                        const gmx_mtop_t *mtop, rvec x[])
2601 {
2602     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2603 }
2604
2605 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2606                  const gmx_mtop_t *mtop, rvec x[])
2607 {
2608     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2609 }
2610
2611 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2612 {
2613     int t, nth;
2614     nth = gmx_omp_nthreads_get(emntDefault);
2615
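         /* Each thread puts one contiguous chunk of the atom range in the box */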
2616 #pragma omp parallel for num_threads(nth) schedule(static)
2617     for (t = 0; t < nth; t++)
2618     {
2619         try
2620         {
2621             size_t natoms = x.size();
2622             size_t offset = (natoms*t    )/nth;
2623             size_t len    = (natoms*(t + 1))/nth - offset;
2624             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2625         }
2626         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2627     }
2628 }
2629
2630 // TODO This can be cleaned up a lot, and move back to runner.cpp
2631 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2632                 const t_inputrec *inputrec,
2633                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2634                 gmx_walltime_accounting_t walltime_accounting,
2635                 nonbonded_verlet_t *nbv,
2636                 const gmx_pme_t *pme,
2637                 gmx_bool bWriteStat)
2638 {
2639     t_nrnb *nrnb_tot = nullptr;
2640     double  delta_t  = 0;
2641     double  nbfs     = 0, mflop = 0;
2642     double  elapsed_time,
2643             elapsed_time_over_all_ranks,
2644             elapsed_time_over_all_threads,
2645             elapsed_time_over_all_threads_over_all_ranks;
2646     /* Control whether it is valid to print a report. Only the
2647        simulation master may print, and it should not do so if the run
2648        terminated prematurely, e.g. before a scheduled reset step. This is
2649        complicated by the fact that PME ranks do not know why they were
2650        sent a pmerecvqxFINISH. To avoid communication deadlocks, we
2651        always perform the communication needed for the report, even when
2652        we have decided not to write one; the extra time this takes at the
2653        end of the run does not matter when no performance report will be
2654        produced.
2655
2656        Further, we only report performance for dynamical integrators,
2657        because those are the only ones for which we plan to
2658        consider doing any optimizations. */
2659     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2660
2661     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2662     {
2663         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2664         printReport = false;
2665     }
2666
2667     if (cr->nnodes > 1)
2668     {
2669         snew(nrnb_tot, 1);
2670 #if GMX_MPI
2671         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2672                       cr->mpi_comm_mysim);
2673 #endif
2674     }
2675     else
2676     {
2677         nrnb_tot = nrnb;
2678     }
2679
2680     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2681     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2682     if (cr->nnodes > 1)
2683     {
2684 #if GMX_MPI
2685         /* reduce elapsed_time over all MPI ranks in the current simulation */
2686         MPI_Allreduce(&elapsed_time,
2687                       &elapsed_time_over_all_ranks,
2688                       1, MPI_DOUBLE, MPI_SUM,
2689                       cr->mpi_comm_mysim);
2690         elapsed_time_over_all_ranks /= cr->nnodes;
2691         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2692          * current simulation. */
2693         MPI_Allreduce(&elapsed_time_over_all_threads,
2694                       &elapsed_time_over_all_threads_over_all_ranks,
2695                       1, MPI_DOUBLE, MPI_SUM,
2696                       cr->mpi_comm_mysim);
2697 #endif
2698     }
2699     else
2700     {
2701         elapsed_time_over_all_ranks                  = elapsed_time;
2702         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2703     }
2704
2705     if (printReport)
2706     {
2707         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2708     }
2709     if (cr->nnodes > 1)
2710     {
2711         sfree(nrnb_tot);
2712     }
2713
2714     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2715     {
2716         print_dd_statistics(cr, inputrec, fplog);
2717     }
2718
2719     /* TODO Move the responsibility for any scaling by thread counts
2720      * to the code that handles the thread region, so that there's a
2721      * mechanism to keep cycle counting working during the transition
2722      * to task parallelism. */
2723     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2724     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2725     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2726     auto cycle_sum(wallcycle_sum(cr, wcycle));
2727
2728     if (printReport)
2729     {
2730         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2731         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2732         if (pme_gpu_task_enabled(pme))
2733         {
2734             pme_gpu_get_timings(pme, &pme_gpu_timings);
2735         }
2736         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2737                         elapsed_time_over_all_ranks,
2738                         wcycle, cycle_sum,
2739                         nbnxn_gpu_timings,
2740                         &pme_gpu_timings);
2741
2742         if (EI_DYNAMICS(inputrec->eI))
2743         {
2744             delta_t = inputrec->delta_t;
2745         }
2746
2747         if (fplog)
2748         {
2749             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2750                        elapsed_time_over_all_ranks,
2751                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2752                        delta_t, nbfs, mflop);
2753         }
2754         if (bWriteStat)
2755         {
2756             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2757                        elapsed_time_over_all_ranks,
2758                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2759                        delta_t, nbfs, mflop);
2760         }
2761     }
2762 }
2763
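/* Set the initial free-energy state and lambda vector from the input record.
 * Does nothing when free-energy perturbation and simulated tempering are
 * both disabled.  On the master rank *fep_state and lambda are filled in;
 * when lam0 is non-null it receives the same initial values on every rank.
 * Each lambda component i is set to fep->init_lambda when that is given
 * (>= 0), and otherwise to fep->all_lambda[i][fep->init_fep_state].  For
 * simulated tempering the reference temperatures are also reset to match
 * the initial state.
 */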
2764 void initialize_lambdas(FILE               *fplog,
2765                         const t_inputrec   &ir,
2766                         bool                isMaster,
2767                         int                *fep_state,
2768                         gmx::ArrayRef<real> lambda,
2769                         double             *lam0)
2770 {
2771     /* TODO: Clean up initialization of fep_state and lambda in
2772        t_state.  This function works, but could probably use a logic
2773        rewrite to keep all the different types of efep straight. */
2774
2775     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2776     {
2777         return;
2778     }
2779
2780     const t_lambda *fep = ir.fepvals;
2781     if (isMaster)
2782     {
2783         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2784                                              a checkpoint, if one is used -- a kludge is
2785                                              in place for now to prevent this. */
2786     }
2787
2788     for (int i = 0; i < efptNR; i++)
2789     {
2790         double thisLambda;
2791         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2792         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2793         {
2794             thisLambda = fep->init_lambda;
2795         }
2796         else
2797         {
2798             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2799         }
2800         if (isMaster)
2801         {
2802             lambda[i] = thisLambda;
2803         }
2804         if (lam0 != nullptr)
2805         {
2806             lam0[i] = thisLambda;
2807         }
2808     }
2809     if (ir.bSimTemp)
2810     {
2811         /* need to rescale control temperatures to match current state */
2812         for (int i = 0; i < ir.opts.ngtc; i++)
2813         {
2814             if (ir.opts.ref_t[i] > 0)
2815             {
2816                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2817             }
2818         }
2819     }
2820
2821     /* Send to the log the information on the current lambdas */
2822     if (fplog != nullptr)
2823     {
2824         fprintf(fplog, "Initial vector of lambda components: [ ");
2825         for (const auto &l : lambda)
2826         {
2827             fprintf(fplog, "%10.4f ", l);
2828         }
2829         fprintf(fplog, "]\n");
2830     }
2831 }
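/* A minimal sketch of how initialize_lambdas() might be called from setup
 * code; the variables below are illustrative assumptions, not taken from
 * this file:
 *
 *   std::array<real, efptNR> lambda;       // per-component lambda values
 *   double                   lam0[efptNR]; // initial lambdas on every rank
 *   int                      fepState = 0;
 *   initialize_lambdas(fplog, *inputrec, SIMMASTER(cr),
 *                      &fepState, lambda, lam0);
 *   // After the call, lambda holds the initial lambda vector on the master
 *   // rank and lam0 holds the same values on every rank.
 */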