[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
117
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
123
124
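/*! \brief Print the current step and, depending on whether the run has a step
 * limit, either the estimated remaining wall-clock time or the current
 * performance in ns/day.
 */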
125 void print_time(FILE                     *out,
126                 gmx_walltime_accounting_t walltime_accounting,
127                 int64_t                   step,
128                 const t_inputrec         *ir,
129                 const t_commrec          *cr)
130 {
131     time_t finish;
132     double dt, elapsed_seconds, time_per_step;
133
134 #if !GMX_THREAD_MPI
135     if (!PAR(cr))
136 #endif
137     {
138         fprintf(out, "\r");
139     }
140     fputs("step ", out);
141     fputs(gmx::int64ToString(step).c_str(), out);
142     fflush(out);
143
144     if ((step >= ir->nstlist))
145     {
146         double seconds_since_epoch = gmx_gettime();
147         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
149         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
150
151         if (ir->nsteps >= 0)
152         {
153             if (dt >= 300)
154             {
155                 finish = static_cast<time_t>(seconds_since_epoch + dt);
156                 auto timebuf = gmx_ctime_r(&finish);
157                 timebuf.erase(timebuf.find_first_of('\n'));
158                 fputs(", will finish ", out);
159                 fputs(timebuf.c_str(), out);
160             }
161             else
162             {
163                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
164             }
165         }
166         else
167         {
168             fprintf(out, " performance: %.1f ns/day    ",
169                     ir->delta_t/1000*24*60*60/time_per_step);
170         }
171     }
172 #if !GMX_THREAD_MPI
173     if (PAR(cr))
174     {
175         fprintf(out, "\n");
176     }
177 #else
178     GMX_UNUSED_VALUE(cr);
179 #endif
180
181     fflush(out);
182 }
183
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185                          double the_time)
186 {
187     if (!fplog)
188     {
189         return;
190     }
191
192     time_t temp_time = static_cast<time_t>(the_time);
193
194     auto   timebuf = gmx_ctime_r(&temp_time);
195
196     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
197 }
198
199 void print_start(FILE *fplog, const t_commrec *cr,
200                  gmx_walltime_accounting_t walltime_accounting,
201                  const char *name)
202 {
203     char buf[STRLEN];
204
205     sprintf(buf, "Started %s", name);
206     print_date_and_time(fplog, cr->nodeid, buf,
207                         walltime_accounting_get_start_time_stamp(walltime_accounting));
208 }
209
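/*! \brief Add the forces in \p forceToAdd to \p f, distributing the work over
 * the default OpenMP threads.
 */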
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
211 {
212     const int      end = forceToAdd.size();
213
214     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
215 #pragma omp parallel for num_threads(nt) schedule(static)
216     for (int i = 0; i < end; i++)
217     {
218         rvec_inc(f[i], forceToAdd[i]);
219     }
220 }
221
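/*! \brief Add the short-range virial contributions for the home atoms
 *
 * Adds the shift-force virial and the per-atom virial for atoms
 * \p start to \p start+homenr to \p vir_part; the total virial is
 * summed later in global_stat().
 */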
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223                         tensor vir_part, const t_graph *graph, const matrix box,
224                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
225 {
226     /* The short-range virial from surrounding boxes */
227     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
229
230     /* Calculate partial virial, for local atoms only, based on short range.
231      * Total virial is computed in global_stat, called from do_md
232      */
233     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
235
236     if (debug)
237     {
238         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
239     }
240 }
241
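/*! \brief Compute the COM pull potential and forces
 *
 * Thin wrapper around pull_potential() that sets up PBC, accumulates the
 * pull energy and restraint dV/dlambda into \p enerd and handles the
 * wallcycle accounting.
 */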
242 static void pull_potential_wrapper(const t_commrec *cr,
243                                    const t_inputrec *ir,
244                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245                                    gmx::ForceWithVirial *force,
246                                    const t_mdatoms *mdatoms,
247                                    gmx_enerdata_t *enerd,
248                                    const real *lambda,
249                                    double t,
250                                    gmx_wallcycle_t wcycle)
251 {
252     t_pbc  pbc;
253     real   dvdl;
254
255     /* Calculate the center-of-mass forces; this requires communication,
256      * which is why pull_potential is called close to other communication.
257      */
258     wallcycle_start(wcycle, ewcPULLPOT);
259     set_pbc(&pbc, ir->ePBC, box);
260     dvdl                     = 0;
261     enerd->term[F_COM_PULL] +=
262         pull_potential(ir->pull_work, mdatoms, &pbc,
263                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265     wallcycle_stop(wcycle, ewcPULLPOT);
266 }
267
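/*! \brief Receive the long-range forces, energies and dV/dlambda from a
 * separate PME rank and accumulate them into the local buffers.
 *
 * Also records the PP-during-PME and separate-PME cycle counts used for
 * domain-decomposition load balancing.
 */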
268 static void pme_receive_force_ener(const t_commrec      *cr,
269                                    gmx::ForceWithVirial *forceWithVirial,
270                                    gmx_enerdata_t       *enerd,
271                                    gmx_wallcycle_t       wcycle)
272 {
273     real   e_q, e_lj, dvdl_q, dvdl_lj;
274     float  cycles_ppdpme, cycles_seppme;
275
276     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
278
279     /* In case of node-splitting, the PP nodes receive the long-range
280      * forces, virial and energy from the PME nodes here.
281      */
282     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283     dvdl_q  = 0;
284     dvdl_lj = 0;
285     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286                       &cycles_seppme);
287     enerd->term[F_COUL_RECIP] += e_q;
288     enerd->term[F_LJ_RECIP]   += e_lj;
289     enerd->dvdl_lin[efptCOUL] += dvdl_q;
290     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
291
292     if (wcycle)
293     {
294         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
295     }
296     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
297 }
298
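/*! \brief Print all home atoms with a force larger than \p forceTolerance
 * and issue a fatal error when non-finite forces are detected.
 */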
299 static void print_large_forces(FILE            *fp,
300                                const t_mdatoms *md,
301                                const t_commrec *cr,
302                                int64_t          step,
303                                real             forceTolerance,
304                                const rvec      *x,
305                                const rvec      *f)
306 {
307     real           force2Tolerance = gmx::square(forceTolerance);
308     gmx::index     numNonFinite    = 0;
309     for (int i = 0; i < md->homenr; i++)
310     {
311         real force2    = norm2(f[i]);
312         bool nonFinite = !std::isfinite(force2);
313         if (force2 >= force2Tolerance || nonFinite)
314         {
315             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
316                     step,
317                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
318         }
319         if (nonFinite)
320         {
321             numNonFinite++;
322         }
323     }
324     if (numNonFinite > 0)
325     {
326         /* Note that with MPI this fatal call on one rank might interrupt
327          * the printing on other ranks. But we can only avoid that with
328          * an expensive MPI barrier that we would need at each step.
329          */
330         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
331     }
332 }
333
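/*! \brief Post-process the force buffers after the main force computation
 *
 * When there are direct virial contributions (such as from PME), spreads the
 * virtual-site forces in that separate buffer, adds the buffer to the main
 * force array and accumulates its virial. Optionally prints large forces.
 */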
334 static void post_process_forces(const t_commrec           *cr,
335                                 int64_t                    step,
336                                 t_nrnb                    *nrnb,
337                                 gmx_wallcycle_t            wcycle,
338                                 const gmx_localtop_t      *top,
339                                 const matrix               box,
340                                 const rvec                 x[],
341                                 rvec                       f[],
342                                 gmx::ForceWithVirial      *forceWithVirial,
343                                 tensor                     vir_force,
344                                 const t_mdatoms           *mdatoms,
345                                 const t_graph             *graph,
346                                 const t_forcerec          *fr,
347                                 const gmx_vsite_t         *vsite,
348                                 int                        flags)
349 {
350     if (fr->haveDirectVirialContributions)
351     {
352         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
353
354         if (vsite)
355         {
356             /* Spread the mesh force on virtual sites to the other particles...
357      * This is parallelized. MPI communication is performed
358              * if the constructing atoms aren't local.
359              */
360             matrix virial = { { 0 } };
361             spread_vsite_f(vsite, x, fDirectVir, nullptr,
362                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
363                            nrnb,
364                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365             forceWithVirial->addVirialContribution(virial);
366         }
367
368         if (flags & GMX_FORCE_VIRIAL)
369         {
370             /* Now add the forces, this is local */
371             sum_forces(f, forceWithVirial->force_);
372
373             /* Add the direct virial contributions */
374             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
376
377             if (debug)
378             {
379                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
380             }
381         }
382     }
383
384     if (fr->print_force >= 0)
385     {
386         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
387     }
388 }
389
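/*! \brief Dispatch the Verlet-scheme non-bonded kernel for the given locality
 *
 * For CPU kernels this also runs dynamic pair-list pruning when due and
 * handles the non-bonded sub-cycle counting; GPU launches are timed separately.
 */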
390 static void do_nb_verlet(t_forcerec *fr,
391                          const interaction_const_t *ic,
392                          gmx_enerdata_t *enerd,
393                          int flags, int ilocality,
394                          int clearF,
395                          int64_t step,
396                          t_nrnb *nrnb,
397                          gmx_wallcycle_t wcycle)
398 {
399     if (!(flags & GMX_FORCE_NONBONDED))
400     {
401         /* skip non-bonded calculation */
402         return;
403     }
404
405     nonbonded_verlet_t       *nbv  = fr->nbv;
406     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
407
408     /* GPU kernel launch overhead is already timed separately */
409     if (fr->cutoff_scheme != ecutsVERLET)
410     {
411         gmx_incons("Invalid cut-off scheme passed!");
412     }
413
414     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
415
416     if (!bUsingGpuKernels)
417     {
418         /* When dynamic pair-list pruning is requested, we need to prune
419          * every nstlistPrune steps.
420          */
421         if (nbv->listParams->useDynamicPruning &&
422             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
423         {
424             /* Prune the pair-list beyond fr->ic->rlistPrune using
425              * the current coordinates of the atoms.
426              */
427             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
428             NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
429             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
430         }
431
432         wallcycle_sub_start(wcycle, ewcsNONBONDED);
433     }
434
435     NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
436
437     if (!bUsingGpuKernels)
438     {
439         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
440     }
441 }
442
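/*! \brief Compute the perturbed non-bonded interactions with the free-energy kernel
 *
 * Runs the free-energy kernel over all thread-local perturbed pair lists and,
 * when soft-core is used and dH/dlambda is requested, recomputes the energies
 * for all foreign lambda points.
 */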
443 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
444                              t_forcerec           *fr,
445                              rvec                  x[],
446                              rvec                  f[],
447                              const t_mdatoms      *mdatoms,
448                              t_lambda             *fepvals,
449                              real                 *lambda,
450                              gmx_enerdata_t       *enerd,
451                              int                   flags,
452                              t_nrnb               *nrnb,
453                              gmx_wallcycle_t       wcycle)
454 {
455     int              donb_flags;
456     nb_kernel_data_t kernel_data;
457     real             lam_i[efptNR];
458     real             dvdl_nb[efptNR];
459     int              th;
460     int              i, j;
461
462     donb_flags = 0;
463     /* Add short-range interactions */
464     donb_flags |= GMX_NONBONDED_DO_SR;
465
466     /* Currently all group scheme kernels always calculate (shift-)forces */
467     if (flags & GMX_FORCE_FORCES)
468     {
469         donb_flags |= GMX_NONBONDED_DO_FORCE;
470     }
471     if (flags & GMX_FORCE_VIRIAL)
472     {
473         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
474     }
475     if (flags & GMX_FORCE_ENERGY)
476     {
477         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
478     }
479
480     kernel_data.flags  = donb_flags;
481     kernel_data.lambda = lambda;
482     kernel_data.dvdl   = dvdl_nb;
483
484     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
485     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
486
487     /* reset free energy components */
488     for (i = 0; i < efptNR; i++)
489     {
490         dvdl_nb[i]  = 0;
491     }
492
493     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
494
495     wallcycle_sub_start(wcycle, ewcsNONBONDED);
496 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
497     for (th = 0; th < nbl_lists->nnbl; th++)
498     {
499         try
500         {
501             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
502                                       x, f, fr, mdatoms, &kernel_data, nrnb);
503         }
504         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
505     }
506
507     if (fepvals->sc_alpha != 0)
508     {
509         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
510         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
511     }
512     else
513     {
514         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
515         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
516     }
517
518     /* If we do foreign lambda and we have soft-core interactions
519      * we have to recalculate the (non-linear) energy contributions.
520      */
521     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
522     {
523         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
524         kernel_data.lambda         = lam_i;
525         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
526         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
527         /* Note that we add to kernel_data.dvdl, but ignore the result */
528
529         for (i = 0; i < enerd->n_lambda; i++)
530         {
531             for (j = 0; j < efptNR; j++)
532             {
533                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
534             }
535             reset_foreign_enerdata(enerd);
536 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
537             for (th = 0; th < nbl_lists->nnbl; th++)
538             {
539                 try
540                 {
541                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
542                                               x, f, fr, mdatoms, &kernel_data, nrnb);
543                 }
544                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
545             }
546
547             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
548             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
549         }
550     }
551
552     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
553 }
554
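/*! \brief Return whether the non-bonded interactions are computed on a GPU */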
555 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
556 {
557     return nbv != nullptr && nbv->bUseGPU;
558 }
559
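/*! \brief Zero \p n rvecs, using OpenMP only when there is enough work to benefit */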
560 static inline void clear_rvecs_omp(int n, rvec v[])
561 {
562     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
563
564     /* Note that we would like to avoid this conditional by putting it
565      * into the omp pragma instead, but then we still take the full
566      * omp parallel for overhead (at least with gcc5).
567      */
568     if (nth == 1)
569     {
570         for (int i = 0; i < n; i++)
571         {
572             clear_rvec(v[i]);
573         }
574     }
575     else
576     {
577 #pragma omp parallel for num_threads(nth) schedule(static)
578         for (int i = 0; i < n; i++)
579         {
580             clear_rvec(v[i]);
581         }
582     }
583 }
584
585 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
586  *
587  * \param groupOptions  Group options, containing T-coupling options
588  */
589 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
590 {
591     real nrdfCoupled   = 0;
592     real nrdfUncoupled = 0;
593     real kineticEnergy = 0;
594     for (int g = 0; g < groupOptions.ngtc; g++)
595     {
596         if (groupOptions.tau_t[g] >= 0)
597         {
598             nrdfCoupled   += groupOptions.nrdf[g];
599             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
600         }
601         else
602         {
603             nrdfUncoupled += groupOptions.nrdf[g];
604         }
605     }
606
607     /* This conditional with > also catches nrdf=0 */
608     if (nrdfCoupled > nrdfUncoupled)
609     {
610         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
611     }
612     else
613     {
614         return 0;
615     }
616 }
617
618 /*! \brief This routine checks that the potential energy is finite.
619  *
620  * Always checks that the potential energy is finite. If step equals
621  * inputrec.init_step also checks that the magnitude of the potential energy
622  * is reasonable. Terminates with a fatal error when a check fails.
623  * Note that passing this check does not guarantee finite forces,
624  * since those use slightly different arithmetic. But in most cases
625  * there is just a narrow coordinate range where forces are not finite
626  * and energies are finite.
627  *
628  * \param[in] step      The step number, used for checking and printing
629  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
630  * \param[in] inputrec  The input record
631  */
632 static void checkPotentialEnergyValidity(int64_t               step,
633                                          const gmx_enerdata_t &enerd,
634                                          const t_inputrec     &inputrec)
635 {
636     /* Threshold valid for comparing absolute potential energy against
637      * the kinetic energy. Normally one should not consider absolute
638      * potential energy values, but with a factor of one million
639      * we should never get false positives.
640      */
641     constexpr real c_thresholdFactor = 1e6;
642
643     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
644     real           averageKineticEnergy = 0;
645     /* We only check for large potential energy at the initial step,
646      * because that is by far the most likely step for this to occur
647      * and because computing the average kinetic energy is not free.
648      * Note: nstcalcenergy >> 1 often does not allow catching large energies
649      * before they become NaN.
650      */
651     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
652     {
653         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
654     }
655
656     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
657                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
658     {
659         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
660                   step,
661                   enerd.term[F_EPOT],
662                   energyIsNotFinite ? "not finite" : "extremely high",
663                   enerd.term[F_LJ],
664                   enerd.term[F_COUL_SR],
665                   energyIsNotFinite ? "non-finite" : "very high",
666                   energyIsNotFinite ? " or NaN" : "");
667     }
668 }
669
670 /*! \brief Compute forces and/or energies for special algorithms
671  *
672  * The intention is to collect all calls to algorithms that compute
673  * forces on local atoms only and that do not contribute to the local
674  * virial sum (but add their virial contribution separately).
675  * Eventually these should likely all become ForceProviders.
676  * Within this function the intention is to have algorithms that do
677  * global communication at the end, so global barriers within the MD loop
678  * are as close together as possible.
679  *
680  * \param[in]     fplog            The log file
681  * \param[in]     cr               The communication record
682  * \param[in]     inputrec         The input record
683  * \param[in]     awh              The Awh module (nullptr if none in use).
684  * \param[in]     enforcedRotation Enforced rotation module.
685  * \param[in]     step             The current MD step
686  * \param[in]     t                The current time
687  * \param[in,out] wcycle           Wallcycle accounting struct
688  * \param[in,out] forceProviders   Pointer to a list of force providers
689  * \param[in]     box              The unit cell
690  * \param[in]     x                The coordinates
691  * \param[in]     mdatoms          Per atom properties
692  * \param[in]     lambda           Array of free-energy lambda values
693  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
694  * \param[in,out] forceWithVirial  Force and virial buffers
695  * \param[in,out] enerd            Energy buffer
696  * \param[in,out] ed               Essential dynamics pointer
697  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
698  *
699  * \todo Remove bNS, which is used incorrectly.
700  * \todo Convert all other algorithms called here to ForceProviders.
701  */
702 static void
703 computeSpecialForces(FILE                          *fplog,
704                      const t_commrec               *cr,
705                      const t_inputrec              *inputrec,
706                      gmx::Awh                      *awh,
707                      gmx_enfrot                    *enforcedRotation,
708                      int64_t                        step,
709                      double                         t,
710                      gmx_wallcycle_t                wcycle,
711                      ForceProviders                *forceProviders,
712                      matrix                         box,
713                      gmx::ArrayRef<const gmx::RVec> x,
714                      const t_mdatoms               *mdatoms,
715                      real                          *lambda,
716                      int                            forceFlags,
717                      gmx::ForceWithVirial          *forceWithVirial,
718                      gmx_enerdata_t                *enerd,
719                      gmx_edsam                     *ed,
720                      gmx_bool                       bNS)
721 {
722     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
723
724     /* NOTE: Currently all ForceProviders only provide forces.
725      *       When they also provide energies, remove this conditional.
726      */
727     if (computeForces)
728     {
729         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
730         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
731
732         /* Collect forces from modules */
733         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
734     }
735
736     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
737     {
738         pull_potential_wrapper(cr, inputrec, box, x,
739                                forceWithVirial,
740                                mdatoms, enerd, lambda, t,
741                                wcycle);
742
743         if (awh)
744         {
745             enerd->term[F_COM_PULL] +=
746                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
747                                                   forceWithVirial,
748                                                   t, step, wcycle, fplog);
749         }
750     }
751
752     rvec *f = as_rvec_array(forceWithVirial->force_.data());
753
754     /* Add the forces from enforced rotation potentials (if any) */
755     if (inputrec->bRot)
756     {
757         wallcycle_start(wcycle, ewcROTadd);
758         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
759         wallcycle_stop(wcycle, ewcROTadd);
760     }
761
762     if (ed)
763     {
764         /* Note that since init_edsam() is called after the initialization
765          * of forcerec, edsam doesn't request the noVirSum force buffer.
766          * Thus if no other algorithm (e.g. PME) requires it, the forces
767          * here will contribute to the virial.
768          */
769         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
770     }
771
772     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
773     if (inputrec->bIMD && computeForces)
774     {
775         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
776     }
777 }
778
779 /*! \brief Launch the prepare_step and spread stages of PME GPU.
780  *
781  * \param[in]  pmedata       The PME structure
782  * \param[in]  box           The box matrix
783  * \param[in]  x             Coordinate array
784  * \param[in]  flags         Force flags
785  * \param[in]  pmeFlags      PME flags
786  * \param[in]  wcycle        The wallcycle structure
787  */
788 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
789                                       matrix          box,
790                                       rvec            x[],
791                                       int             flags,
792                                       int             pmeFlags,
793                                       gmx_wallcycle_t wcycle)
794 {
795     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
796     pme_gpu_launch_spread(pmedata, x, wcycle);
797 }
798
799 /*! \brief Launch the FFT and gather stages of PME GPU
800  *
801  * This function only implements setting the output forces (no accumulation).
802  *
803  * \param[in]  pmedata        The PME structure
804  * \param[in]  wcycle         The wallcycle structure
805  */
806 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
807                                      gmx_wallcycle_t   wcycle)
808 {
809     pme_gpu_launch_complex_transforms(pmedata, wcycle);
810     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
811 }
812
813 /*! \brief
814  *  Polling wait for either of the PME or nonbonded GPU tasks.
815  *
816  * Instead of a static order in waiting for GPU tasks, this function
817  * polls to check which of the two tasks completes first, and does the
818  * associated force-buffer reduction overlapped with the other task.
819  * Unlike a static scheduling order, this can always overlap
820  * one of the reductions, regardless of the GPU task completion order.
821  *
822  * \param[in]     nbv              Nonbonded verlet structure
823  * \param[in,out] pmedata          PME module data
824  * \param[in,out] force            Force array to reduce task outputs into.
825  * \param[in,out] forceWithVirial  Force and virial buffers
826  * \param[in,out] fshift           Shift force output vector results are reduced into
827  * \param[in,out] enerd            Energy data structure results are reduced into
828  * \param[in]     flags            Force flags
829  * \param[in]     pmeFlags         PME flags
830  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
831  * \param[in]     wcycle           The wallcycle structure
832  */
833 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
834                                         gmx_pme_t                           *pmedata,
835                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
836                                         gmx::ForceWithVirial                *forceWithVirial,
837                                         rvec                                 fshift[],
838                                         gmx_enerdata_t                      *enerd,
839                                         int                                  flags,
840                                         int                                  pmeFlags,
841                                         bool                                 haveOtherWork,
842                                         gmx_wallcycle_t                      wcycle)
843 {
844     bool isPmeGpuDone = false;
845     bool isNbGpuDone  = false;
846
847
848     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
849
850     while (!isPmeGpuDone || !isNbGpuDone)
851     {
852         if (!isPmeGpuDone)
853         {
854             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
855             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
856         }
857
858         if (!isNbGpuDone)
859         {
860             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
861             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
862             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
863                                                     flags, eatLocal,
864                                                     haveOtherWork,
865                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
866                                                     fshift, completionType);
867             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
868             // To get the call count right, when the task finished we
869             // issue a start/stop.
870             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
871             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
872             if (isNbGpuDone)
873             {
874                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
875                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
876
877                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
878                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
879             }
880         }
881     }
882 }
883
884 /*! \brief
885  *  Launch the dynamic rolling pruning GPU task.
886  *
887  *  We currently alternate local/non-local list pruning in odd-even steps
888  *  (only pruning every second step without DD).
889  *
890  * \param[in]     cr               The communication record
891  * \param[in]     nbv              Nonbonded verlet structure
892  * \param[in]     inputrec         The input record
893  * \param[in]     step             The current MD step
894  */
895 static inline void launchGpuRollingPruning(const t_commrec          *cr,
896                                            const nonbonded_verlet_t *nbv,
897                                            const t_inputrec         *inputrec,
898                                            const int64_t             step)
899 {
900     /* We should not launch the rolling pruning kernel at a search
901      * step or just before search steps, since that's useless.
902      * Without domain decomposition we prune at even steps.
903      * With domain decomposition we alternate local and non-local
904      * pruning at even and odd steps.
905      */
906     int  numRollingParts     = nbv->listParams->numRollingParts;
907     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
908     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
909     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
910     if (stepWithCurrentList > 0 &&
911         stepWithCurrentList < inputrec->nstlist - 1 &&
912         (stepIsEven || DOMAINDECOMP(cr)))
913     {
914         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
915                                           stepIsEven ? eintLocal : eintNonlocal,
916                                           numRollingParts);
917     }
918 }
919
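/*! \brief Compute forces, and when requested energies and virial, with the
 * Verlet cut-off scheme.
 *
 * Handles pair-search gridding and list generation, launches GPU non-bonded,
 * bonded and PME work where applicable, and runs the corresponding CPU tasks.
 */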
920 static void do_force_cutsVERLET(FILE *fplog,
921                                 const t_commrec *cr,
922                                 const gmx_multisim_t *ms,
923                                 const t_inputrec *inputrec,
924                                 gmx::Awh *awh,
925                                 gmx_enfrot *enforcedRotation,
926                                 int64_t step,
927                                 t_nrnb *nrnb,
928                                 gmx_wallcycle_t wcycle,
929                                 const gmx_localtop_t *top,
930                                 const gmx_groups_t * /* groups */,
931                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
932                                 history_t *hist,
933                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
934                                 tensor vir_force,
935                                 const t_mdatoms *mdatoms,
936                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
937                                 real *lambda,
938                                 t_graph *graph,
939                                 t_forcerec *fr,
940                                 gmx::PpForceWorkload *ppForceWorkload,
941                                 interaction_const_t *ic,
942                                 const gmx_vsite_t *vsite,
943                                 rvec mu_tot,
944                                 double t,
945                                 gmx_edsam *ed,
946                                 const int flags,
947                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
948                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
949 {
950     int                 cg1, i, j;
951     double              mu[2*DIM];
952     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
953     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
954     rvec                vzero, box_diag;
955     float               cycles_pme, cycles_wait_gpu;
956     nonbonded_verlet_t *nbv = fr->nbv;
957
958     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
959     bNS           = ((flags & GMX_FORCE_NS) != 0);
960     bFillGrid     = (bNS && bStateChanged);
961     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
962     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
963     bUseGPU       = fr->nbv->bUseGPU;
964     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
965
966     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
967     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
968     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
969         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
970     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
971         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
972         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
973         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
974
975     /* At a search step we need to start the first balancing region
976      * somewhere early inside the step after communication during domain
977      * decomposition (and not during the previous step as usual).
978      */
979     if (bNS &&
980         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
981     {
982         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
983     }
984
985     cycles_wait_gpu = 0;
986
987     const int start  = 0;
988     const int homenr = mdatoms->homenr;
989
990     clear_mat(vir_force);
991
992     if (DOMAINDECOMP(cr))
993     {
994         cg1 = cr->dd->globalAtomGroupIndices.size();
995     }
996     else
997     {
998         cg1 = top->cgs.nr;
999     }
1000     if (fr->n_tpi > 0)
1001     {
1002         cg1--;
1003     }
1004
1005     if (bStateChanged)
1006     {
1007         update_forcerec(fr, box);
1008
1009         if (inputrecNeedMutot(inputrec))
1010         {
1011             /* Calculate total (local) dipole moment in a temporary common array.
1012              * This makes it possible to sum them over nodes faster.
1013              */
1014             calc_mu(start, homenr,
1015                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1016                     mu, mu+DIM);
1017         }
1018     }
1019
1020     if (fr->ePBC != epbcNONE)
1021     {
1022         /* Compute shift vectors every step,
1023          * because of pressure coupling or box deformation!
1024          */
1025         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1026         {
1027             calc_shifts(box, fr->shift_vec);
1028         }
1029
1030         if (bCalcCGCM)
1031         {
1032             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1033             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1034         }
1035         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1036         {
1037             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1038         }
1039     }
1040
1041     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1042                                  fr->shift_vec, nbv->nbat);
1043
1044 #if GMX_MPI
1045     if (!thisRankHasDuty(cr, DUTY_PME))
1046     {
1047         /* Send particle coordinates to the PME nodes.
1048          * Since this is only implemented for domain decomposition
1049          * and domain decomposition does not use the graph,
1050          * we do not need to worry about shifting.
1051          */
1052         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1053                                  lambda[efptCOUL], lambda[efptVDW],
1054                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1055                                  step, wcycle);
1056     }
1057 #endif /* GMX_MPI */
1058
1059     if (useGpuPme)
1060     {
1061         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1062     }
1063
1064     /* do gridding for pair search */
1065     if (bNS)
1066     {
1067         if (graph && bStateChanged)
1068         {
1069             /* Calculate intramolecular shift vectors to make molecules whole */
1070             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1071         }
1072
1073         clear_rvec(vzero);
1074         box_diag[XX] = box[XX][XX];
1075         box_diag[YY] = box[YY][YY];
1076         box_diag[ZZ] = box[ZZ][ZZ];
1077
1078         wallcycle_start(wcycle, ewcNS);
1079         if (!DOMAINDECOMP(cr))
1080         {
1081             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1082             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1083                               0, vzero, box_diag,
1084                               nullptr, 0, mdatoms->homenr, -1,
1085                               fr->cginfo, x.unpaddedArrayRef(),
1086                               0, nullptr,
1087                               nbv->grp[eintLocal].kernel_type,
1088                               nbv->nbat);
1089             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1090         }
1091         else
1092         {
1093             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1094             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1095                                        fr->cginfo, x.unpaddedArrayRef(),
1096                                        nbv->grp[eintNonlocal].kernel_type,
1097                                        nbv->nbat);
1098             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1099         }
1100
1101         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1102
1103         wallcycle_stop(wcycle, ewcNS);
1104     }
1105
1106     /* initialize the GPU atom data and copy shift vector */
1107     if (bUseGPU)
1108     {
1109         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1110         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1111
1112         if (bNS)
1113         {
1114             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1115         }
1116
1117         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1118
1119         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1120
1121         if (bNS && fr->gpuBonded)
1122         {
1123             /* Now that all atoms are on the grid, we can assign bonded
1124              * interactions to the GPU, where the grid order is
1125              * needed. Also the xq, f and fshift device buffers have
1126              * been reallocated if needed, so the bonded code can
1127              * learn about them. */
1128             // TODO the xq, f, and fshift buffers are now shared
1129             // resources, so they should be maintained by a
1130             // higher-level object than the nb module.
1131             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1132                                                                   top->idef,
1133                                                                   nbnxn_gpu_get_xq(nbv->gpu_nbv),
1134                                                                   nbnxn_gpu_get_f(nbv->gpu_nbv),
1135                                                                   nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1136             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1137         }
1138
1139         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1140     }
1141
1142     /* do local pair search */
1143     if (bNS)
1144     {
1145         wallcycle_start_nocount(wcycle, ewcNS);
1146         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1147         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1148                             &top->excls,
1149                             nbv->listParams->rlistOuter,
1150                             nbv->min_ci_balanced,
1151                             &nbv->grp[eintLocal].nbl_lists,
1152                             eintLocal,
1153                             nbv->grp[eintLocal].kernel_type,
1154                             nrnb);
1155         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1156         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1157         {
1158             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1159         }
1160         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1161
1162         if (bUseGPU)
1163         {
1164             /* initialize local pair-list on the GPU */
1165             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1166                                     nbv->grp[eintLocal].nbl_lists.nblGpu[0],
1167                                     eintLocal);
1168         }
1169         wallcycle_stop(wcycle, ewcNS);
1170     }
1171     else
1172     {
1173         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1174                                         nbv->nbat, wcycle);
1175     }
1176
1177     if (bUseGPU)
1178     {
1179         if (DOMAINDECOMP(cr))
1180         {
1181             ddOpenBalanceRegionGpu(cr->dd);
1182         }
1183
1184         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1185
1186         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1187         nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
1188         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1189
1190         // The bonded work is not split into separate local and non-local parts, so with DD
1191         // we can only launch the kernel after the non-local coordinates have been received.
1192         if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1193         {
1194             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1195             fr->gpuBonded->launchKernels(fr, flags, box);
1196             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1197         }
1198
1199         /* launch local nonbonded work on GPU */
1200         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1201         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1202                      step, nrnb, wcycle);
1203         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1204         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1205     }
1206
1207     if (useGpuPme)
1208     {
1209         // In PME GPU and mixed mode we launch FFT / gather after the
1210         // X copy/transform to allow overlap as well as after the GPU NB
1211         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1212         // the nonbonded kernel.
1213         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1214     }
1215
1216     /* Communicate coordinates and sum dipole if necessary +
1217        do non-local pair search */
1218     if (DOMAINDECOMP(cr))
1219     {
1220         if (bNS)
1221         {
1222             wallcycle_start_nocount(wcycle, ewcNS);
1223             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1224
1225             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1226                                 &top->excls,
1227                                 nbv->listParams->rlistOuter,
1228                                 nbv->min_ci_balanced,
1229                                 &nbv->grp[eintNonlocal].nbl_lists,
1230                                 eintNonlocal,
1231                                 nbv->grp[eintNonlocal].kernel_type,
1232                                 nrnb);
1233             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1234             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1235             {
1236                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1237             }
1238             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1239
1240             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1241             {
1242                 /* initialize non-local pair-list on the GPU */
1243                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1244                                         nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
1245                                         eintNonlocal);
1246             }
1247             wallcycle_stop(wcycle, ewcNS);
1248         }
1249         else
1250         {
1251             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1252
1253             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1254                                             nbv->nbat, wcycle);
1255         }
1256
1257         if (bUseGPU)
1258         {
1259             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1260
1261             /* launch non-local nonbonded tasks on GPU */
1262             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1263             nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1264             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1265
1266             if (ppForceWorkload->haveGpuBondedWork)
1267             {
1268                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1269                 fr->gpuBonded->launchKernels(fr, flags, box);
1270                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1271             }
1272
1273             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1274             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1275                          step, nrnb, wcycle);
1276             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1277
1278             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1279         }
1280     }
1281
1282     if (bUseGPU)
1283     {
1284         /* launch D2H copy-back F */
1285         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1286         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1287         if (DOMAINDECOMP(cr))
1288         {
1289             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1290                                      flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1291         }
1292         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1293                                  flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1294         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1295
1296         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1297         {
1298             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1299             fr->gpuBonded->launchEnergyTransfer();
1300             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1301         }
1302         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1303     }
1304
1305     if (bStateChanged && inputrecNeedMutot(inputrec))
1306     {
1307         if (PAR(cr))
1308         {
1309             gmx_sumd(2*DIM, mu, cr);
1310             ddReopenBalanceRegionCpu(cr->dd);
1311         }
1312
1313         for (i = 0; i < 2; i++)
1314         {
1315             for (j = 0; j < DIM; j++)
1316             {
1317                 fr->mu_tot[i][j] = mu[i*DIM + j];
1318             }
1319         }
1320     }
1321     if (fr->efep == efepNO)
1322     {
1323         copy_rvec(fr->mu_tot[0], mu_tot);
1324     }
1325     else
1326     {
1327         for (j = 0; j < DIM; j++)
1328         {
1329             mu_tot[j] =
1330                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1331                 lambda[efptCOUL]*fr->mu_tot[1][j];
1332         }
1333     }
1334
1335     /* Reset energies */
1336     reset_enerdata(enerd);
1337     clear_rvecs(SHIFTS, fr->fshift);
1338
1339     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1340     {
1341         wallcycle_start(wcycle, ewcPPDURINGPME);
1342         dd_force_flop_start(cr->dd, nrnb);
1343     }
1344
1345     if (inputrec->bRot)
1346     {
1347         wallcycle_start(wcycle, ewcROT);
1348         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1349         wallcycle_stop(wcycle, ewcROT);
1350     }
1351
1352     /* Temporary solution until all routines take PaddedRVecVector */
1353     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1354
1355     /* Start the force cycle counter.
1356      * Note that a different counter is used for dynamic load balancing.
1357      */
1358     wallcycle_start(wcycle, ewcFORCE);
1359
1360     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1361     if (bDoForces)
1362     {
1363         /* If we need to compute the virial, we might need a separate
1364          * force buffer for algorithms for which the virial is calculated
1365          * directly, such as PME.
1366          */
1367         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1368         {
1369             forceRef = *fr->forceBufferForDirectVirialContributions;
1370
1371             /* TODO: update comment
1372              * We only compute forces on local atoms. Note that vsites can
1373              * spread to non-local atoms, but that part of the buffer is
1374              * cleared separately in the vsite spreading code.
1375              */
1376             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1377         }
1378         /* Clear the short- and long-range forces */
1379         clear_rvecs_omp(fr->natoms_force_constr, f);
1380     }
1381
1382     /* forceWithVirial uses the local atom range only */
1383     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1384
1385     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1386     {
1387         clear_pull_forces(inputrec->pull_work);
1388     }
1389
1390     /* We calculate the non-bonded forces, when done on the CPU, here.
1391      * We do this before calling do_force_lowlevel, because in that
1392      * function, the listed forces are calculated before PME, which
1393      * does communication.  With this order, non-bonded and listed
1394      * force calculation imbalance can be balanced out by the domain
1395      * decomposition load balancing.
1396      */
1397
1398     if (!bUseOrEmulGPU)
1399     {
1400         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1401                      step, nrnb, wcycle);
1402     }
1403
1404     if (fr->efep != efepNO)
1405     {
1406         /* Calculate the local and non-local free energy interactions here.
1407          * Happens here on the CPU both with and without GPU.
1408          */
1409         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1410         {
1411             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1412                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1413                              inputrec->fepvals, lambda,
1414                              enerd, flags, nrnb, wcycle);
1415         }
1416
1417         if (DOMAINDECOMP(cr) &&
1418             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1419         {
1420             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1421                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1422                              inputrec->fepvals, lambda,
1423                              enerd, flags, nrnb, wcycle);
1424         }
1425     }
1426
1427     if (!bUseOrEmulGPU)
1428     {
1429         int aloc;
1430
1431         if (DOMAINDECOMP(cr))
1432         {
1433             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1434                          step, nrnb, wcycle);
1435         }
1436
1437         if (!bUseOrEmulGPU)
1438         {
1439             aloc = eintLocal;
1440         }
1441         else
1442         {
1443             aloc = eintNonlocal;
1444         }
1445
1446         /* Add all the non-bonded forces to the normal force array.
1447          * With domain decomposition this can be split into a local and
1448          * a non-local part, so that communication overlaps with computation.
1449          */
1450         wallcycle_stop(wcycle, ewcFORCE);
1451
1452         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1453
1454         wallcycle_start_nocount(wcycle, ewcFORCE);
1455
1456         /* if there are multiple fshift output buffers reduce them */
1457         if ((flags & GMX_FORCE_VIRIAL) &&
1458             nbv->grp[aloc].nbl_lists.nnbl > 1)
1459         {
1460             /* This is not in a subcounter because it takes a
1461                negligible and constant-sized amount of time */
1462             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1463                                                      fr->fshift);
1464         }
1465     }
1466
1467     /* update QMMMrec, if necessary */
1468     if (fr->bQMMM)
1469     {
1470         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1471     }
1472
1473     /* Compute the bonded and non-bonded energies and optionally forces */
1474     do_force_lowlevel(fr, inputrec, &(top->idef),
1475                       cr, ms, nrnb, wcycle, mdatoms,
1476                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1477                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1478                       flags, &cycles_pme);
1479
1480     wallcycle_stop(wcycle, ewcFORCE);
1481
1482     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1483                          step, t, wcycle,
1484                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1485                          flags, &forceWithVirial, enerd,
1486                          ed, bNS);
1487
1488     if (bUseOrEmulGPU)
1489     {
1490         /* wait for non-local forces (or calculate in emulation mode) */
1491         if (DOMAINDECOMP(cr))
1492         {
1493             if (bUseGPU)
1494             {
1495                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1496                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1497                                            flags, eatNonlocal,
1498                                            ppForceWorkload->haveGpuBondedWork,
1499                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1500                                            fr->fshift);
1501                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1502             }
1503             else
1504             {
1505                 wallcycle_start_nocount(wcycle, ewcFORCE);
1506                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1507                              step, nrnb, wcycle);
1508                 wallcycle_stop(wcycle, ewcFORCE);
1509             }
1510
1511             /* skip the reduction if there was no non-local work to do */
1512             if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
1513             {
1514                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1515                                                nbv->nbat, f, wcycle);
1516             }
1517         }
1518     }
1519
1520     if (DOMAINDECOMP(cr))
1521     {
1522         /* We are done with the CPU compute.
1523          * We will now communicate the non-local forces.
1524          * If we use a GPU this will overlap with GPU work, so in that case
1525          * we do not close the DD force balancing region here.
1526          */
1527         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1528         {
1529             ddCloseBalanceRegionCpu(cr->dd);
1530         }
1531         if (bDoForces)
1532         {
1533             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1534         }
1535     }
1536
1537     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1538     // an alternating wait/reduction scheme.
1539     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1540     if (alternateGpuWait)
1541     {
1542         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1543     }
1544
1545     if (!alternateGpuWait && useGpuPme)
1546     {
1547         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1548     }
1549
1550     /* Wait for local GPU NB outputs on the non-alternating wait path */
1551     if (!alternateGpuWait && bUseGPU)
1552     {
1553         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1554          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1555          * but even with a step of 0.1 ms the difference is less than 1%
1556          * of the step time.
1557          */
1558         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1559
1560         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1561         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1562                                    flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1563                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1564                                    fr->fshift);
1565         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1566
1567         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1568         {
1569             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1570             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1571             {
1572                 /* We measured only a few cycles, so it could be that the kernel
1573                  * and transfer finished earlier and there was no actual
1574                  * wait time, only API call overhead.
1575                  * Then the actual time could be anywhere between 0 and
1576                  * cycles_wait_est. We will use half of cycles_wait_est.
1577                  */
1578                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1579             }
1580             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1581         }
1582     }
1583
1584     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1585     {
1586         // NOTE: emulation kernel is not included in the balancing region,
1587         // but emulation mode does not target performance anyway
1588         wallcycle_start_nocount(wcycle, ewcFORCE);
1589         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1590                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1591                      step, nrnb, wcycle);
1592         wallcycle_stop(wcycle, ewcFORCE);
1593     }
1594
1595     if (useGpuPme)
1596     {
1597         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1598     }
1599
1600     if (bUseGPU)
1601     {
1602         /* now clear the GPU outputs while we finish the step on the CPU */
1603         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1604         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1605         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1606
1607         /* Is dynamic pair-list pruning activated? */
1608         if (nbv->listParams->useDynamicPruning)
1609         {
1610             launchGpuRollingPruning(cr, nbv, inputrec, step);
1611         }
1612         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1613         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1614     }
1615
1616     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1617     {
1618         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1619         // in principle this should be included in the DD balancing region,
1620         // but generally it is infrequent so we'll omit it for the sake of
1621         // simpler code
1622         fr->gpuBonded->accumulateEnergyTerms(enerd);
1623         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1624
1625         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1626         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1627         fr->gpuBonded->clearEnergies();
1628         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1629         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1630     }
1631
1632     /* Do the nonbonded GPU (or emulation) force buffer reduction
1633      * on the non-alternating path. */
1634     if (bUseOrEmulGPU && !alternateGpuWait)
1635     {
1636         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1637                                        nbv->nbat, f, wcycle);
1638     }
1639     if (DOMAINDECOMP(cr))
1640     {
1641         dd_force_flop_stop(cr->dd, nrnb);
1642     }
1643
1644     if (bDoForces)
1645     {
1646         /* If we have NoVirSum forces, but we do not calculate the virial,
1647          * we sum fr->f_novirsum=f later.
1648          */
1649         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1650         {
1651             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1652                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1653         }
1654
1655         if (flags & GMX_FORCE_VIRIAL)
1656         {
1657             /* Calculation of the virial must be done after vsites! */
1658             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1659                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1660         }
1661     }
1662
1663     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1664     {
1665         /* In case of node-splitting, the PP nodes receive the long-range
1666          * forces, virial and energy from the PME nodes here.
1667          */
1668         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1669     }
1670
1671     if (bDoForces)
1672     {
1673         post_process_forces(cr, step, nrnb, wcycle,
1674                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1675                             vir_force, mdatoms, graph, fr, vsite,
1676                             flags);
1677     }
1678
1679     if (flags & GMX_FORCE_ENERGY)
1680     {
1681         /* Sum the potential energy terms from group contributions */
1682         sum_epot(&(enerd->grpp), enerd->term);
1683
1684         if (!EI_TPI(inputrec->eI))
1685         {
1686             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1687         }
1688     }
1689 }
1690
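/* Group cut-off scheme counterpart of do_force_cutsVERLET(). All non-bonded
 * work runs on the CPU, with no GPU offload or overlap paths: coordinates are
 * communicated, forces are computed and force communication follows in
 * sequence.
 */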
1691 static void do_force_cutsGROUP(FILE *fplog,
1692                                const t_commrec *cr,
1693                                const gmx_multisim_t *ms,
1694                                const t_inputrec *inputrec,
1695                                gmx::Awh *awh,
1696                                gmx_enfrot *enforcedRotation,
1697                                int64_t step,
1698                                t_nrnb *nrnb,
1699                                gmx_wallcycle_t wcycle,
1700                                gmx_localtop_t *top,
1701                                const gmx_groups_t *groups,
1702                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1703                                history_t *hist,
1704                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1705                                tensor vir_force,
1706                                const t_mdatoms *mdatoms,
1707                                gmx_enerdata_t *enerd,
1708                                t_fcdata *fcd,
1709                                real *lambda,
1710                                t_graph *graph,
1711                                t_forcerec *fr,
1712                                const gmx_vsite_t *vsite,
1713                                rvec mu_tot,
1714                                double t,
1715                                gmx_edsam *ed,
1716                                int flags,
1717                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1718                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1719 {
1720     int        cg0, cg1, i, j;
1721     double     mu[2*DIM];
1722     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1723     gmx_bool   bDoForces;
1724     float      cycles_pme;
1725
1726     const int  start  = 0;
1727     const int  homenr = mdatoms->homenr;
1728
1729     clear_mat(vir_force);
1730
1731     cg0 = 0;
1732     if (DOMAINDECOMP(cr))
1733     {
1734         cg1 = cr->dd->globalAtomGroupIndices.size();
1735     }
1736     else
1737     {
1738         cg1 = top->cgs.nr;
1739     }
1740     if (fr->n_tpi > 0)
1741     {
1742         cg1--;
1743     }
1744
1745     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1746     bNS            = ((flags & GMX_FORCE_NS) != 0);
1747     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1748     bFillGrid      = (bNS && bStateChanged);
1749     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1750     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1751
1752     if (bStateChanged)
1753     {
1754         update_forcerec(fr, box);
1755
1756         if (inputrecNeedMutot(inputrec))
1757         {
1758             /* Calculate total (local) dipole moment in a temporary common array.
1759              * This makes it possible to sum them over nodes faster.
1760              */
1761             calc_mu(start, homenr,
1762                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1763                     mu, mu+DIM);
1764         }
1765     }
1766
1767     if (fr->ePBC != epbcNONE)
1768     {
1769         /* Compute shift vectors every step,
1770          * because of pressure coupling or box deformation!
1771          */
1772         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1773         {
1774             calc_shifts(box, fr->shift_vec);
1775         }
1776
1777         if (bCalcCGCM)
1778         {
1779             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1780                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1781             inc_nrnb(nrnb, eNR_CGCM, homenr);
1782             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1783         }
1784         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1785         {
1786             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1787         }
1788     }
1789     else if (bCalcCGCM)
1790     {
1791         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1792         inc_nrnb(nrnb, eNR_CGCM, homenr);
1793     }
1794
1795     if (bCalcCGCM && gmx_debug_at)
1796     {
1797         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1798     }
1799
1800 #if GMX_MPI
1801     if (!thisRankHasDuty(cr, DUTY_PME))
1802     {
1803         /* Send particle coordinates to the pme nodes.
1804          * Since this is only implemented for domain decomposition
1805          * and domain decomposition does not use the graph,
1806          * we do not need to worry about shifting.
1807          */
1808         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1809                                  lambda[efptCOUL], lambda[efptVDW],
1810                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1811                                  step, wcycle);
1812     }
1813 #endif /* GMX_MPI */
1814
1815     /* Communicate coordinates and sum dipole if necessary */
1816     if (DOMAINDECOMP(cr))
1817     {
1818         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1819
1820         /* No GPU support, no move_x overlap, so reopen the balance region here */
1821         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1822         {
1823             ddReopenBalanceRegionCpu(cr->dd);
1824         }
1825     }
1826
1827     if (inputrecNeedMutot(inputrec))
1828     {
1829         if (bStateChanged)
1830         {
1831             if (PAR(cr))
1832             {
1833                 gmx_sumd(2*DIM, mu, cr);
1834                 ddReopenBalanceRegionCpu(cr->dd);
1835             }
1836             for (i = 0; i < 2; i++)
1837             {
1838                 for (j = 0; j < DIM; j++)
1839                 {
1840                     fr->mu_tot[i][j] = mu[i*DIM + j];
1841                 }
1842             }
1843         }
1844         if (fr->efep == efepNO)
1845         {
1846             copy_rvec(fr->mu_tot[0], mu_tot);
1847         }
1848         else
1849         {
1850             for (j = 0; j < DIM; j++)
1851             {
1852                 mu_tot[j] =
1853                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1854             }
1855         }
1856     }
1857
1858     /* Reset energies */
1859     reset_enerdata(enerd);
1860     clear_rvecs(SHIFTS, fr->fshift);
1861
1862     if (bNS)
1863     {
1864         wallcycle_start(wcycle, ewcNS);
1865
1866         if (graph && bStateChanged)
1867         {
1868             /* Calculate intramolecular shift vectors to make molecules whole */
1869             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1870         }
1871
1872         /* Do the actual neighbour searching */
1873         ns(fplog, fr, box,
1874            groups, top, mdatoms,
1875            cr, nrnb, bFillGrid);
1876
1877         wallcycle_stop(wcycle, ewcNS);
1878     }
1879
1880     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1881     {
1882         wallcycle_start(wcycle, ewcPPDURINGPME);
1883         dd_force_flop_start(cr->dd, nrnb);
1884     }
1885
1886     if (inputrec->bRot)
1887     {
1888         wallcycle_start(wcycle, ewcROT);
1889         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1890         wallcycle_stop(wcycle, ewcROT);
1891     }
1892
1893     /* Temporary solution until all routines take PaddedRVecVector */
1894     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1895
1896     /* Start the force cycle counter.
1897      * Note that a different counter is used for dynamic load balancing.
1898      */
1899     wallcycle_start(wcycle, ewcFORCE);
1900
1901     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1902     if (bDoForces)
1903     {
1904         /* If we need to compute the virial, we might need a separate
1905          * force buffer for algorithms for which the virial is calculated
1906          * separately, such as PME.
1907          */
1908         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1909         {
1910             forceRef = *fr->forceBufferForDirectVirialContributions;
1911
1912             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1913         }
1914
1915         /* Clear the short- and long-range forces */
1916         clear_rvecs(fr->natoms_force_constr, f);
1917     }
1918
1919     /* forceWithVirial might need the full force atom range */
1920     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1921
1922     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1923     {
1924         clear_pull_forces(inputrec->pull_work);
1925     }
1926
1927     /* update QMMMrec, if necessary */
1928     if (fr->bQMMM)
1929     {
1930         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1931     }
1932
1933     /* Compute the bonded and non-bonded energies and optionally forces */
1934     do_force_lowlevel(fr, inputrec, &(top->idef),
1935                       cr, ms, nrnb, wcycle, mdatoms,
1936                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1937                       box, inputrec->fepvals, lambda,
1938                       graph, &(top->excls), fr->mu_tot,
1939                       flags,
1940                       &cycles_pme);
1941
1942     wallcycle_stop(wcycle, ewcFORCE);
1943
1944     if (DOMAINDECOMP(cr))
1945     {
1946         dd_force_flop_stop(cr->dd, nrnb);
1947
1948         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1949         {
1950             ddCloseBalanceRegionCpu(cr->dd);
1951         }
1952     }
1953
1954     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1955                          step, t, wcycle,
1956                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1957                          flags, &forceWithVirial, enerd,
1958                          ed, bNS);
1959
1960     if (bDoForces)
1961     {
1962         /* Communicate the forces */
1963         if (DOMAINDECOMP(cr))
1964         {
1965             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1966             /* Do we need to communicate the separate force array
1967              * for terms that do not contribute to the single sum virial?
1968              * Position restraints and electric fields do not introduce
1969              * inter-cg forces, only full electrostatics methods do.
1970              * When we do not calculate the virial, fr->f_novirsum = f,
1971              * so we have already communicated these forces.
1972              */
1973             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1974                 (flags & GMX_FORCE_VIRIAL))
1975             {
1976                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1977             }
1978         }
1979
1980         /* If we have NoVirSum forces, but we do not calculate the virial,
1981          * we sum fr->f_novirsum=f later.
1982          */
1983         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1984         {
1985             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1986                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1987         }
1988
1989         if (flags & GMX_FORCE_VIRIAL)
1990         {
1991             /* Calculation of the virial must be done after vsites! */
1992             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1993                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1994         }
1995     }
1996
1997     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1998     {
1999         /* In case of node-splitting, the PP nodes receive the long-range
2000          * forces, virial and energy from the PME nodes here.
2001          */
2002         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2003     }
2004
2005     if (bDoForces)
2006     {
2007         post_process_forces(cr, step, nrnb, wcycle,
2008                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2009                             vir_force, mdatoms, graph, fr, vsite,
2010                             flags);
2011     }
2012
2013     if (flags & GMX_FORCE_ENERGY)
2014     {
2015         /* Sum the potential energy terms from group contributions */
2016         sum_epot(&(enerd->grpp), enerd->term);
2017
2018         if (!EI_TPI(inputrec->eI))
2019         {
2020             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2021         }
2022     }
2023
2024 }
2025
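/* Top-level force routine: clears GMX_FORCE_NONBONDED when non-bonded
 * interactions are disabled and dispatches to the Verlet or group cut-off
 * scheme implementation based on inputrec->cutoff_scheme.
 */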
2026 void do_force(FILE                                     *fplog,
2027               const t_commrec                          *cr,
2028               const gmx_multisim_t                     *ms,
2029               const t_inputrec                         *inputrec,
2030               gmx::Awh                                 *awh,
2031               gmx_enfrot                               *enforcedRotation,
2032               int64_t                                   step,
2033               t_nrnb                                   *nrnb,
2034               gmx_wallcycle_t                           wcycle,
2035               gmx_localtop_t                           *top,
2036               const gmx_groups_t                       *groups,
2037               matrix                                    box,
2038               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
2039               history_t                                *hist,
2040               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
2041               tensor                                    vir_force,
2042               const t_mdatoms                          *mdatoms,
2043               gmx_enerdata_t                           *enerd,
2044               t_fcdata                                 *fcd,
2045               gmx::ArrayRef<real>                       lambda,
2046               t_graph                                  *graph,
2047               t_forcerec                               *fr,
2048               gmx::PpForceWorkload                     *ppForceWorkload,
2049               const gmx_vsite_t                        *vsite,
2050               rvec                                      mu_tot,
2051               double                                    t,
2052               gmx_edsam                                *ed,
2053               int                                       flags,
2054               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2055               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2056 {
2057     /* modify force flag if not doing nonbonded */
2058     if (!fr->bNonbonded)
2059     {
2060         flags &= ~GMX_FORCE_NONBONDED;
2061     }
2062
2063     switch (inputrec->cutoff_scheme)
2064     {
2065         case ecutsVERLET:
2066             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2067                                 awh, enforcedRotation, step, nrnb, wcycle,
2068                                 top,
2069                                 groups,
2070                                 box, x, hist,
2071                                 force, vir_force,
2072                                 mdatoms,
2073                                 enerd, fcd,
2074                                 lambda.data(), graph,
2075                                 fr,
2076                                 ppForceWorkload,
2077                                 fr->ic,
2078                                 vsite, mu_tot,
2079                                 t, ed,
2080                                 flags,
2081                                 ddOpenBalanceRegion,
2082                                 ddCloseBalanceRegion);
2083             break;
2084         case ecutsGROUP:
2085             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2086                                awh, enforcedRotation, step, nrnb, wcycle,
2087                                top,
2088                                groups,
2089                                box, x, hist,
2090                                force, vir_force,
2091                                mdatoms,
2092                                enerd, fcd,
2093                                lambda.data(), graph,
2094                                fr, vsite, mu_tot,
2095                                t, ed,
2096                                flags,
2097                                ddOpenBalanceRegion,
2098                                ddCloseBalanceRegion);
2099             break;
2100         default:
2101             gmx_incons("Invalid cut-off scheme passed!");
2102     }
2103
2104     /* In case we don't have constraints and are using GPUs, the next balancing
2105      * region starts here.
2106      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2107      * virial calculation and COM pulling, is thus not included in
2108      * the balance timing, which is OK as most of these tasks do communication.
2109      */
2110     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2111     {
2112         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2113     }
2114 }
2115
2116
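/* Constrains the starting coordinates. For velocity Verlet the starting
 * velocities are constrained directly; for other integrators with velocities
 * (leap-frog style) they are constrained implicitly, by also constraining the
 * positions at t = -dt with the t = 0 positions as reference.
 */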
2117 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2118                         const t_inputrec *ir, const t_mdatoms *md,
2119                         t_state *state)
2120 {
2121     int             i, m, start, end;
2122     int64_t         step;
2123     real            dt = ir->delta_t;
2124     real            dvdl_dum;
2125     rvec           *savex;
2126
2127     /* We need to allocate one element extra, since we might use
2128      * (unaligned) 4-wide SIMD loads to access rvec entries.
2129      */
2130     snew(savex, state->natoms + 1);
2131
2132     start = 0;
2133     end   = md->homenr;
2134
2135     if (debug)
2136     {
2137         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2138                 start, md->homenr, end);
2139     }
2140     /* Do a first constraining pass to reset the particles... */
2141     step = ir->init_step;
2142     if (fplog)
2143     {
2144         char buf[STEPSTRSIZE];
2145         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2146                 gmx_step_str(step, buf));
2147     }
2148     dvdl_dum = 0;
2149
2150     /* constrain the current position */
2151     constr->apply(TRUE, FALSE,
2152                   step, 0, 1.0,
2153                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
2154                   state->box,
2155                   state->lambda[efptBONDED], &dvdl_dum,
2156                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2157     if (EI_VV(ir->eI))
2158     {
2159         /* constrain the initial velocity, and save it */
2160         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2161         constr->apply(TRUE, FALSE,
2162                       step, 0, 1.0,
2163                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2164                       state->box,
2165                       state->lambda[efptBONDED], &dvdl_dum,
2166                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2167     }
2168     /* constrain the initial velocities at t-dt/2 */
2169     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2170     {
2171         auto x = makeArrayRef(state->x).subArray(start, end);
2172         auto v = makeArrayRef(state->v).subArray(start, end);
2173         for (i = start; (i < end); i++)
2174         {
2175             for (m = 0; (m < DIM); m++)
2176             {
2177                 /* Reverse the velocity */
2178                 v[i][m] = -v[i][m];
2179                 /* Store the position at t-dt in buf */
2180                 savex[i][m] = x[i][m] + dt*v[i][m];
2181             }
2182         }
2183         /* Shake the positions at t=-dt with the positions at t=0
2184          * as reference coordinates.
2185          */
2186         if (fplog)
2187         {
2188             char buf[STEPSTRSIZE];
2189             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2190                     gmx_step_str(step, buf));
2191         }
2192         dvdl_dum = 0;
2193         constr->apply(TRUE, FALSE,
2194                       step, -1, 1.0,
2195                       state->x.rvec_array(), savex, nullptr,
2196                       state->box,
2197                       state->lambda[efptBONDED], &dvdl_dum,
2198                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2199
2200         for (i = start; i < end; i++)
2201         {
2202             for (m = 0; m < DIM; m++)
2203             {
2204                 /* Re-reverse the velocities */
2205                 v[i][m] = -v[i][m];
2206             }
2207         }
2208     }
2209     sfree(savex);
2210 }
2211
2212
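/* Integrates the tabulated dispersion (offstart == 0, table scaled by 6) or
 * repulsion (offstart == 4, scaled by 12) interaction over table bins
 * [rstart, rend), returning 4*pi times the integrals of r^2*V(r) (energy)
 * and r^3*dV/dr (virial), with tabfactor undoing the table scaling.
 */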
2213 static void
2214 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2215                 double *enerout, double *virout)
2216 {
2217     double enersum, virsum;
2218     double invscale, invscale2, invscale3;
2219     double r, ea, eb, ec, pa, pb, pc, pd;
2220     double y0, f, g, h;
2221     int    ri, offset;
2222     double tabfactor;
2223
2224     invscale  = 1.0/scale;
2225     invscale2 = invscale*invscale;
2226     invscale3 = invscale*invscale2;
2227
2228     /* The following summation is derived from the cubic spline definition,
2229      * Numerical Recipes in C, second edition, pp. 113-116.  It is exact for
2230      * the cubic spline.  We first calculate the negative of the
2231      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2232      * add the more standard, abrupt cutoff correction to that result,
2233      * yielding the long-range correction for a switched function.  We
2234      * perform both the pressure and energy loops at the same time for
2235      * simplicity, as the computational cost is low. */
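    /* Per table bin the loop below accumulates
     *   enersum += integral over the bin of r^2*V(r) dr
     *   virsum  += integral over the bin of r^3*dV/dr dr
     * using the cubic-spline coefficients (y0, f, g, h) stored in vdwtab;
     * the 4*pi prefactor of the radial integration is applied after the loop.
     */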
2236
2237     if (offstart == 0)
2238     {
2239         /* Since the dispersion table has been scaled down a factor
2240          * 6.0 and the repulsion a factor 12.0 to compensate for the
2241          * c6/c12 parameters inside nbfp[] being scaled up (to save
2242          * flops in kernels), we need to correct for this.
2243          */
2244         tabfactor = 6.0;
2245     }
2246     else
2247     {
2248         tabfactor = 12.0;
2249     }
2250
2251     enersum = 0.0;
2252     virsum  = 0.0;
2253     for (ri = rstart; ri < rend; ++ri)
2254     {
2255         r  = ri*invscale;
2256         ea = invscale3;
2257         eb = 2.0*invscale2*r;
2258         ec = invscale*r*r;
2259
2260         pa = invscale3;
2261         pb = 3.0*invscale2*r;
2262         pc = 3.0*invscale*r*r;
2263         pd = r*r*r;
2264
2265         /* this "8" is from the packing in the vdwtab array - perhaps
2266            should be defined? */
2267
2268         offset = 8*ri + offstart;
2269         y0     = vdwtab[offset];
2270         f      = vdwtab[offset+1];
2271         g      = vdwtab[offset+2];
2272         h      = vdwtab[offset+3];
2273
2274         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2275         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2276     }
2277     *enerout = 4.0*M_PI*enersum*tabfactor;
2278     *virout  = 4.0*M_PI*virsum*tabfactor;
2279 }
2280
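/* Precomputes the long-range correction constants stored in fr: the energy
 * and virial differences between the modified/truncated and the full 1/r^6
 * and 1/r^12 interactions (enerdiffsix/twelve, virdiffsix/twelve) and the
 * constant potential shifts inside the cut-off (enershiftsix/twelve).
 * calc_dispcorr() combines these with the system density and composition.
 */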
2281 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2282 {
2283     double   eners[2], virs[2], enersum, virsum;
2284     double   r0, rc3, rc9;
2285     int      ri0, ri1, i;
2286     real     scale, *vdwtab;
2287
2288     fr->enershiftsix    = 0;
2289     fr->enershifttwelve = 0;
2290     fr->enerdiffsix     = 0;
2291     fr->enerdifftwelve  = 0;
2292     fr->virdiffsix      = 0;
2293     fr->virdifftwelve   = 0;
2294
2295     const interaction_const_t *ic = fr->ic;
2296
2297     if (eDispCorr != edispcNO)
2298     {
2299         for (i = 0; i < 2; i++)
2300         {
2301             eners[i] = 0;
2302             virs[i]  = 0;
2303         }
2304         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2305             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2306             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2307             (ic->vdwtype == evdwSHIFT) ||
2308             (ic->vdwtype == evdwSWITCH))
2309         {
2310             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2311                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2312                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2313             {
2314                 gmx_fatal(FARGS,
2315                           "With dispersion correction rvdw-switch can not be zero "
2316                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2317             }
2318
2319             /* TODO This code depends on the logic in tables.c that
2320                constructs the table layout, which should be made
2321                explicit in future cleanup. */
2322             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2323                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2324             scale  = fr->dispersionCorrectionTable->scale;
2325             vdwtab = fr->dispersionCorrectionTable->data;
2326
2327             /* Round the cut-offs to exact table values for precision */
2328             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2329             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2330
2331             /* The code below has some support for handling force-switching, i.e.
2332              * when the force (instead of potential) is switched over a limited
2333              * region. This leads to a constant shift in the potential inside the
2334              * switching region, which we can handle by adding a constant energy
2335              * term in the force-switch case just like when we do potential-shift.
2336              *
2337              * For now this is not enabled, but to keep the functionality in the
2338              * code we check separately for switch and shift. When we do force-switch
2339              * the shifting point is rvdw_switch, while it is the cutoff when we
2340              * have a classical potential-shift.
2341              *
2342              * For a pure potential-shift the potential has a constant shift
2343              * all the way out to the cutoff, and that is it. For other forms
2344              * we need to calculate the constant shift up to the point where we
2345              * start modifying the potential.
2346              */
2347             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2348
2349             r0   = ri0/scale;
2350             rc3  = r0*r0*r0;
2351             rc9  = rc3*rc3*rc3;
2352
2353             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2354                 (ic->vdwtype == evdwSHIFT))
2355             {
2356                 /* Determine the constant energy shift below rvdw_switch.
2357                  * Table has a scale factor since we have scaled it down to compensate
2358                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2359                  */
2360                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2361                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2362             }
2363             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2364             {
2365                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2366                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2367             }
2368
2369             /* Add the constant part from 0 to rvdw_switch.
2370              * This integration from 0 to rvdw_switch overcounts the number
2371              * of interactions by 1, as it also counts the self interaction.
2372              * We will correct for this later.
2373              */
2374             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2375             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2376
2377             /* Calculate the contribution in the range [r0,r1] where we
2378              * modify the potential. For a pure potential-shift modifier we will
2379              * have ri0==ri1, and there will not be any contribution here.
2380              */
2381             for (i = 0; i < 2; i++)
2382             {
2383                 enersum = 0;
2384                 virsum  = 0;
2385                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2386                 eners[i] -= enersum;
2387                 virs[i]  -= virsum;
2388             }
2389
2390             /* Alright: Above we compensated by REMOVING the parts outside r0
2391              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2392              *
2393              * Regardless of whether r0 is the point where we start switching,
2394              * or the cutoff where we calculated the constant shift, we include
2395              * all the parts we are missing out to infinity from r0 by
2396              * calculating the analytical dispersion correction.
2397              */
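            /* With g(r) = 1 these tail integrals are exact:
             *   integral from r0 to infinity of 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3)
             *   integral from r0 to infinity of 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9)
             * and the corresponding virial integrals of 4*pi*r^3*dV/dr give
             * 8*pi/r0^3 and -16*pi/(3*r0^9).
             */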
2398             eners[0] += -4.0*M_PI/(3.0*rc3);
2399             eners[1] +=  4.0*M_PI/(9.0*rc9);
2400             virs[0]  +=  8.0*M_PI/rc3;
2401             virs[1]  += -16.0*M_PI/(3.0*rc9);
2402         }
2403         else if (ic->vdwtype == evdwCUT ||
2404                  EVDW_PME(ic->vdwtype) ||
2405                  ic->vdwtype == evdwUSER)
2406         {
2407             if (ic->vdwtype == evdwUSER && fplog)
2408             {
2409                 fprintf(fplog,
2410                         "WARNING: using dispersion correction with user tables\n");
2411             }
2412
2413             /* Note that with LJ-PME, the dispersion correction is multiplied
2414              * by the difference between the actual C6 and the C6 value given
2415              * by the combination rule.
2416              * This means the normal energy and virial difference formulas
2417              * can be used here.
2418              */
2419
2420             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2421             rc9  = rc3*rc3*rc3;
2422             /* Contribution beyond the cut-off */
2423             eners[0] += -4.0*M_PI/(3.0*rc3);
2424             eners[1] +=  4.0*M_PI/(9.0*rc9);
2425             if (ic->vdw_modifier == eintmodPOTSHIFT)
2426             {
2427                 /* Contribution within the cut-off */
2428                 eners[0] += -4.0*M_PI/(3.0*rc3);
2429                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2430             }
2431             /* Contribution beyond the cut-off */
2432             virs[0]  +=  8.0*M_PI/rc3;
2433             virs[1]  += -16.0*M_PI/(3.0*rc9);
2434         }
2435         else
2436         {
2437             gmx_fatal(FARGS,
2438                       "Dispersion correction is not implemented for vdw-type = %s",
2439                       evdw_names[ic->vdwtype]);
2440         }
2441
2442         /* When we deprecate the group kernels the code below can go too */
2443         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2444         {
2445             /* Calculate self-interaction coefficient (assuming that
2446              * the reciprocal-space contribution is constant in the
2447              * region that contributes to the self-interaction).
2448              */
2449             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2450
2451             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2452             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2453         }
2454
2455         fr->enerdiffsix    = eners[0];
2456         fr->enerdifftwelve = eners[1];
2457         /* The 0.5 is due to the Gromacs definition of the virial */
2458         fr->virdiffsix     = 0.5*virs[0];
2459         fr->virdifftwelve  = 0.5*virs[1];
2460     }
2461 }
2462
2463 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2464                    const matrix box, real lambda, tensor pres, tensor virial,
2465                    real *prescorr, real *enercorr, real *dvdlcorr)
2466 {
2467     gmx_bool bCorrAll, bCorrPres;
2468     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2469     int      m;
2470
2471     *prescorr = 0;
2472     *enercorr = 0;
2473     *dvdlcorr = 0;
2474
2475     clear_mat(virial);
2476     clear_mat(pres);
2477
2478     if (ir->eDispCorr != edispcNO)
2479     {
2480         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2481                      ir->eDispCorr == edispcAllEnerPres);
2482         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2483                      ir->eDispCorr == edispcAllEnerPres);
2484
2485         invvol = 1/det(box);
2486         if (fr->n_tpi)
2487         {
2488             /* Only correct for the interactions with the inserted molecule */
2489             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2490             ninter = fr->n_tpi;
2491         }
2492         else
2493         {
2494             dens   = fr->numAtomsForDispersionCorrection*invvol;
2495             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2496         }
2497
2498         if (ir->efep == efepNO)
2499         {
2500             avcsix    = fr->avcsix[0];
2501             avctwelve = fr->avctwelve[0];
2502         }
2503         else
2504         {
2505             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2506             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2507         }
2508
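        /* The energy correction is the number of interacting pairs times the
         * density times the per-pair C6 tail integral. For a plain cut-off
         * without potential modifier (enershiftsix == 0) in a homogeneous
         * system this reduces to the textbook tail correction
         * -2*pi*N^2*<C6>/(3*V*rc^3).
         */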
2509         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2510         *enercorr += avcsix*enerdiff;
2511         dvdlambda  = 0.0;
2512         if (ir->efep != efepNO)
2513         {
2514             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2515         }
2516         if (bCorrAll)
2517         {
2518             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2519             *enercorr += avctwelve*enerdiff;
2520             if (fr->efep != efepNO)
2521             {
2522                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2523             }
2524         }
2525
2526         if (bCorrPres)
2527         {
2528             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2529             if (ir->eDispCorr == edispcAllEnerPres)
2530             {
2531                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2532             }
2533             /* The factor 2 is because of the Gromacs virial definition */
2534             spres = -2.0*invvol*svir*PRESFAC;
2535
2536             for (m = 0; m < DIM; m++)
2537             {
2538                 virial[m][m] += svir;
2539                 pres[m][m]   += spres;
2540             }
2541             *prescorr += spres;
2542         }
2543
2544         /* Can't currently control when this prints; for now, just print when debugging */
2545         if (debug)
2546         {
2547             if (bCorrAll)
2548             {
2549                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2550                         avcsix, avctwelve);
2551             }
2552             if (bCorrPres)
2553             {
2554                 fprintf(debug,
2555                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2556                         *enercorr, spres, svir);
2557             }
2558             else
2559             {
2560                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2561             }
2562         }
2563
2564         if (fr->efep != efepNO)
2565         {
2566             *dvdlcorr += dvdlambda;
2567         }
2568     }
2569 }
2570
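/* Makes all molecules whole: builds a shift graph per molecule type and
 * shifts each molecule copy accordingly. Single-atom molecules (and, on
 * later calls, molecules consisting of a single charge group) need no PBC
 * treatment and are skipped.
 */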
2571 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2572                             const gmx_mtop_t *mtop, rvec x[],
2573                             gmx_bool bFirst)
2574 {
2575     t_graph        *graph;
2576     int             as, mol;
2577
2578     if (bFirst && fplog)
2579     {
2580         fprintf(fplog, "Removing pbc first time\n");
2581     }
2582
2583     snew(graph, 1);
2584     as = 0;
2585     for (const gmx_molblock_t &molb : mtop->molblock)
2586     {
2587         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2588         if (moltype.atoms.nr == 1 ||
2589             (!bFirst && moltype.cgs.nr == 1))
2590         {
2591             /* Just one atom or charge group in the molecule, no PBC required */
2592             as += molb.nmol*moltype.atoms.nr;
2593         }
2594         else
2595         {
2596             mk_graph_moltype(moltype, graph);
2597
2598             for (mol = 0; mol < molb.nmol; mol++)
2599             {
2600                 mk_mshift(fplog, graph, ePBC, box, x+as);
2601
2602                 shift_self(graph, box, x+as);
2603                 /* The molecule is whole now.
2604                  * We don't need the second mk_mshift call as in do_pbc_first,
2605                  * since we no longer need this graph.
2606                  */
2607
2608                 as += moltype.atoms.nr;
2609             }
2610             done_graph(graph);
2611         }
2612     }
2613     sfree(graph);
2614 }
2615
2616 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2617                        const gmx_mtop_t *mtop, rvec x[])
2618 {
2619     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2620 }
2621
2622 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2623                  const gmx_mtop_t *mtop, rvec x[])
2624 {
2625     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2626 }
2627
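/* OpenMP-parallel version of put_atoms_in_box(): the atom range is divided
 * statically over the default OpenMP threads and each thread wraps its
 * contiguous chunk of coordinates into the box.
 */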
2628 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2629 {
2630     int t, nth;
2631     nth = gmx_omp_nthreads_get(emntDefault);
2632
2633 #pragma omp parallel for num_threads(nth) schedule(static)
2634     for (t = 0; t < nth; t++)
2635     {
2636         try
2637         {
2638             size_t natoms = x.size();
2639             size_t offset = (natoms*t    )/nth;
2640             size_t len    = (natoms*(t + 1))/nth - offset;
2641             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2642         }
2643         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2644     }
2645 }
2646
2647 // TODO This can be cleaned up a lot, and move back to runner.cpp
2648 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2649                 const t_inputrec *inputrec,
2650                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2651                 gmx_walltime_accounting_t walltime_accounting,
2652                 nonbonded_verlet_t *nbv,
2653                 const gmx_pme_t *pme,
2654                 gmx_bool bWriteStat)
2655 {
2656     t_nrnb *nrnb_tot = nullptr;
2657     double  delta_t  = 0;
2658     double  nbfs     = 0, mflop = 0;
2659     double  elapsed_time,
2660             elapsed_time_over_all_ranks,
2661             elapsed_time_over_all_threads,
2662             elapsed_time_over_all_threads_over_all_ranks;
2663     /* Control whether it is valid to print a report. Only the
2664        simulation master may print, but it should not do so if the run
2665        terminated e.g. before a scheduled reset step. This is
2666        complicated by the fact that PME ranks are unaware of the
2667        reason why they were sent a pmerecvqxFINISH. To avoid
2668        communication deadlocks, we always do the communication for the
2669        report, even if we've decided not to write the report, because
2670        how long it takes to finish the run is not important when we've
2671        decided not to report on the simulation performance.
2672
2673        Further, we only report performance for dynamical integrators,
2674        because those are the only ones for which we plan to
2675        consider doing any optimizations. */
2676     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2677
2678     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2679     {
2680         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2681         printReport = false;
2682     }
2683
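    /* Sum the flop counters over all ranks so that the report below covers
     * the whole simulation rather than only this rank. */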
2684     if (cr->nnodes > 1)
2685     {
2686         snew(nrnb_tot, 1);
2687 #if GMX_MPI
2688         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2689                       cr->mpi_comm_mysim);
2690 #endif
2691     }
2692     else
2693     {
2694         nrnb_tot = nrnb;
2695     }
2696
2697     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2698     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2699     if (cr->nnodes > 1)
2700     {
2701 #if GMX_MPI
2702         /* reduce elapsed_time over all MPI ranks in the current simulation */
2703         MPI_Allreduce(&elapsed_time,
2704                       &elapsed_time_over_all_ranks,
2705                       1, MPI_DOUBLE, MPI_SUM,
2706                       cr->mpi_comm_mysim);
2707         elapsed_time_over_all_ranks /= cr->nnodes;
2708         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2709          * current simulation. */
2710         MPI_Allreduce(&elapsed_time_over_all_threads,
2711                       &elapsed_time_over_all_threads_over_all_ranks,
2712                       1, MPI_DOUBLE, MPI_SUM,
2713                       cr->mpi_comm_mysim);
2714 #endif
2715     }
2716     else
2717     {
2718         elapsed_time_over_all_ranks                  = elapsed_time;
2719         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2720     }
2721
2722     if (printReport)
2723     {
2724         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2725     }
2726     if (cr->nnodes > 1)
2727     {
2728         sfree(nrnb_tot);
2729     }
2730
2731     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2732     {
2733         print_dd_statistics(cr, inputrec, fplog);
2734     }
2735
2736     /* TODO Move the responsibility for any scaling by thread counts
2737      * to the code that handled the thread region, so that there's a
2738      * mechanism to keep cycle counting working during the transition
2739      * to task parallelism. */
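    /* Scale the per-rank cycle counters by the OpenMP thread counts used for
     * PP and PME work, then sum them over all ranks for the report. */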
2740     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2741     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2742     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2743     auto cycle_sum(wallcycle_sum(cr, wcycle));
2744
2745     if (printReport)
2746     {
2747         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2748         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2749         if (pme_gpu_task_enabled(pme))
2750         {
2751             pme_gpu_get_timings(pme, &pme_gpu_timings);
2752         }
2753         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2754                         elapsed_time_over_all_ranks,
2755                         wcycle, cycle_sum,
2756                         nbnxn_gpu_timings,
2757                         &pme_gpu_timings);
2758
2759         if (EI_DYNAMICS(inputrec->eI))
2760         {
2761             delta_t = inputrec->delta_t;
2762         }
2763
2764         if (fplog)
2765         {
2766             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2767                        elapsed_time_over_all_ranks,
2768                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2769                        delta_t, nbfs, mflop);
2770         }
2771         if (bWriteStat)
2772         {
2773             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2774                        elapsed_time_over_all_ranks,
2775                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2776                        delta_t, nbfs, mflop);
2777         }
2778     }
2779 }
2780
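/* Descriptive note (added): sets the initial FEP state and lambda vector at
 * the start of a run. Each component is taken from init_lambda when that is
 * set (>= 0), e.g. init_lambda = 0.5 makes every component 0.5; otherwise the
 * component is looked up in all_lambda at init_fep_state. lam0, when provided,
 * receives the same values, and for simulated tempering the group reference
 * temperatures are set to match the initial state. */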
2781 void initialize_lambdas(FILE               *fplog,
2782                         const t_inputrec   &ir,
2783                         bool                isMaster,
2784                         int                *fep_state,
2785                         gmx::ArrayRef<real> lambda,
2786                         double             *lam0)
2787 {
2788     /* TODO: Clean up initialization of fep_state and lambda in
2789        t_state.  This function works, but could probably use a logic
2790        rewrite to keep all the different types of efep straight. */
2791
2792     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2793     {
2794         return;
2795     }
2796
2797     const t_lambda *fep = ir.fepvals;
2798     if (isMaster)
2799     {
2800         *fep_state = fep->init_fep_state; /* this might overwrite the value read
2801                                              from a checkpoint; for now a kludge
2802                                              elsewhere prevents that. */
2803     }
2804
2805     for (int i = 0; i < efptNR; i++)
2806     {
2807         double thisLambda;
2808         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2809         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2810         {
2811             thisLambda = fep->init_lambda;
2812         }
2813         else
2814         {
2815             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2816         }
2817         if (isMaster)
2818         {
2819             lambda[i] = thisLambda;
2820         }
2821         if (lam0 != nullptr)
2822         {
2823             lam0[i] = thisLambda;
2824         }
2825     }
2826     if (ir.bSimTemp)
2827     {
2828         /* set the group reference temperatures to the temperature of the initial simulated-tempering state */
2829         for (int i = 0; i < ir.opts.ngtc; i++)
2830         {
2831             if (ir.opts.ref_t[i] > 0)
2832             {
2833                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2834             }
2835         }
2836     }
2837
2838     /* Send to the log the information on the current lambdas */
2839     if (fplog != nullptr)
2840     {
2841         fprintf(fplog, "Initial vector of lambda components:[ ");
2842         for (const auto &l : lambda)
2843         {
2844             fprintf(fplog, "%10.4f ", l);
2845         }
2846         fprintf(fplog, "]\n");
2847     }
2848 }