Use enum class for nbnxm locality
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/domdec/partition.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed_forces/bonded.h"
66 #include "gromacs/listed_forces/disre.h"
67 #include "gromacs/listed_forces/gpubonded.h"
68 #include "gromacs/listed_forces/manage_threading.h"
69 #include "gromacs/listed_forces/orires.h"
70 #include "gromacs/math/arrayrefwithpadding.h"
71 #include "gromacs/math/functions.h"
72 #include "gromacs/math/units.h"
73 #include "gromacs/math/vec.h"
74 #include "gromacs/math/vecdump.h"
75 #include "gromacs/mdlib/calcmu.h"
76 #include "gromacs/mdlib/calcvir.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/force.h"
79 #include "gromacs/mdlib/forcerec.h"
80 #include "gromacs/mdlib/gmx_omp_nthreads.h"
81 #include "gromacs/mdlib/mdrun.h"
82 #include "gromacs/mdlib/ppforceworkload.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdtypes/commrec.h"
86 #include "gromacs/mdtypes/enerdata.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/nbnxm/atomdata.h"
93 #include "gromacs/nbnxm/gpu_data_mgmt.h"
94 #include "gromacs/nbnxm/nbnxm.h"
95 #include "gromacs/pbcutil/ishift.h"
96 #include "gromacs/pbcutil/mshift.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/pulling/pull.h"
99 #include "gromacs/pulling/pull_rotation.h"
100 #include "gromacs/timing/cyclecounter.h"
101 #include "gromacs/timing/gpu_timing.h"
102 #include "gromacs/timing/wallcycle.h"
103 #include "gromacs/timing/wallcyclereporting.h"
104 #include "gromacs/timing/walltime_accounting.h"
105 #include "gromacs/topology/topology.h"
106 #include "gromacs/utility/arrayref.h"
107 #include "gromacs/utility/basedefinitions.h"
108 #include "gromacs/utility/cstringutil.h"
109 #include "gromacs/utility/exceptions.h"
110 #include "gromacs/utility/fatalerror.h"
111 #include "gromacs/utility/gmxassert.h"
112 #include "gromacs/utility/gmxmpi.h"
113 #include "gromacs/utility/logger.h"
114 #include "gromacs/utility/smalloc.h"
115 #include "gromacs/utility/strconvert.h"
116 #include "gromacs/utility/sysinfo.h"
117
118 // TODO: this environment variable allows us to verify before release
119 // that on less common architectures the total cost of polling is not larger than
120 // a blocking wait (so polling does not introduce overhead when the static
121 // PME-first ordering would suffice).
122 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
123
124
125 void print_time(FILE                     *out,
126                 gmx_walltime_accounting_t walltime_accounting,
127                 int64_t                   step,
128                 const t_inputrec         *ir,
129                 const t_commrec          *cr)
130 {
131     time_t finish;
132     double dt, elapsed_seconds, time_per_step;
133
134 #if !GMX_THREAD_MPI
135     if (!PAR(cr))
136 #endif
137     {
138         fprintf(out, "\r");
139     }
140     fputs("step ", out);
141     fputs(gmx::int64ToString(step).c_str(), out);
142     fflush(out);
143
144     if (step >= ir->nstlist)
145     {
146         double seconds_since_epoch = gmx_gettime();
147         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
148         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
149         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
150
151         if (ir->nsteps >= 0)
152         {
153             if (dt >= 300)
154             {
155                 finish = static_cast<time_t>(seconds_since_epoch + dt);
156                 auto timebuf = gmx_ctime_r(&finish);
157                 timebuf.erase(timebuf.find_first_of('\n'));
158                 fputs(", will finish ", out);
159                 fputs(timebuf.c_str(), out);
160             }
161             else
162             {
163                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
164             }
165         }
166         else
167         {
168             fprintf(out, " performance: %.1f ns/day    ",
169                     ir->delta_t/1000*24*60*60/time_per_step);
170         }
171     }
172 #if !GMX_THREAD_MPI
173     if (PAR(cr))
174     {
175         fprintf(out, "\n");
176     }
177 #else
178     GMX_UNUSED_VALUE(cr);
179 #endif
180
181     fflush(out);
182 }
183
184 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
185                          double the_time)
186 {
187     if (!fplog)
188     {
189         return;
190     }
191
192     time_t temp_time = static_cast<time_t>(the_time);
193
194     auto   timebuf = gmx_ctime_r(&temp_time);
195
196     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
197 }
198
199 void print_start(FILE *fplog, const t_commrec *cr,
200                  gmx_walltime_accounting_t walltime_accounting,
201                  const char *name)
202 {
203     char buf[STRLEN];
204
205     sprintf(buf, "Started %s", name);
206     print_date_and_time(fplog, cr->nodeid, buf,
207                         walltime_accounting_get_start_time_stamp(walltime_accounting));
208 }
209
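/*! \brief Adds the forces in \p forceToAdd to \p f, OpenMP-parallelized over the entries */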
210 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
211 {
212     const int      end = forceToAdd.size();
213
214     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
215 #pragma omp parallel for num_threads(nt) schedule(static)
216     for (int i = 0; i < end; i++)
217     {
218         rvec_inc(f[i], forceToAdd[i]);
219     }
220 }
221
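/*! \brief Computes the short-range virial from the shift forces and from the home-atom positions and forces */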
222 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
223                         tensor vir_part, const t_graph *graph, const matrix box,
224                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
225 {
226     /* The short-range virial from surrounding boxes */
227     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
228     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
229
230     /* Calculate partial virial, for local atoms only, based on short range.
231      * Total virial is computed in global_stat, called from do_md
232      */
233     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
235
236     if (debug)
237     {
238         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
239     }
240 }
241
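/*! \brief Computes the COM pull potential and forces, accumulating the energy and dV/dlambda into \p enerd */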
242 static void pull_potential_wrapper(const t_commrec *cr,
243                                    const t_inputrec *ir,
244                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
245                                    gmx::ForceWithVirial *force,
246                                    const t_mdatoms *mdatoms,
247                                    gmx_enerdata_t *enerd,
248                                    const real *lambda,
249                                    double t,
250                                    gmx_wallcycle_t wcycle)
251 {
252     t_pbc  pbc;
253     real   dvdl;
254
255     /* Calculate the center-of-mass forces; this requires communication,
256      * which is why pull_potential is called close to other communication.
257      */
258     wallcycle_start(wcycle, ewcPULLPOT);
259     set_pbc(&pbc, ir->ePBC, box);
260     dvdl                     = 0;
261     enerd->term[F_COM_PULL] +=
262         pull_potential(ir->pull_work, mdatoms, &pbc,
263                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
264     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
265     wallcycle_stop(wcycle, ewcPULLPOT);
266 }
267
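/*! \brief With separate PME ranks, receives the long-range forces, energies and dV/dlambda from the PME rank and accumulates them into \p forceWithVirial and \p enerd */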
268 static void pme_receive_force_ener(const t_commrec      *cr,
269                                    gmx::ForceWithVirial *forceWithVirial,
270                                    gmx_enerdata_t       *enerd,
271                                    gmx_wallcycle_t       wcycle)
272 {
273     real   e_q, e_lj, dvdl_q, dvdl_lj;
274     float  cycles_ppdpme, cycles_seppme;
275
276     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
277     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
278
279     /* In case of node-splitting, the PP nodes receive the long-range
280      * forces, virial and energy from the PME nodes here.
281      */
282     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
283     dvdl_q  = 0;
284     dvdl_lj = 0;
285     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
286                       &cycles_seppme);
287     enerd->term[F_COUL_RECIP] += e_q;
288     enerd->term[F_LJ_RECIP]   += e_lj;
289     enerd->dvdl_lin[efptCOUL] += dvdl_q;
290     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
291
292     if (wcycle)
293     {
294         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
295     }
296     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
297 }
298
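/*! \brief Prints all home atoms with a force norm above \p forceTolerance or with a non-finite force; issues a fatal error when non-finite forces are present */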
299 static void print_large_forces(FILE            *fp,
300                                const t_mdatoms *md,
301                                const t_commrec *cr,
302                                int64_t          step,
303                                real             forceTolerance,
304                                const rvec      *x,
305                                const rvec      *f)
306 {
307     real           force2Tolerance = gmx::square(forceTolerance);
308     gmx::index     numNonFinite    = 0;
309     for (int i = 0; i < md->homenr; i++)
310     {
311         real force2    = norm2(f[i]);
312         bool nonFinite = !std::isfinite(force2);
313         if (force2 >= force2Tolerance || nonFinite)
314         {
315             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
316                     step,
317                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
318         }
319         if (nonFinite)
320         {
321             numNonFinite++;
322         }
323     }
324     if (numNonFinite > 0)
325     {
326         /* Note that with MPI this fatal call on one rank might interrupt
327          * the printing on other ranks. But we can only avoid that with
328          * an expensive MPI barrier that we would need at each step.
329          */
330         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
331     }
332 }
333
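/*! \brief Post-processes the forces after the main force computation
 *
 * When a separate direct-virial force buffer is in use, spreads its
 * virtual-site contributions and adds the buffer and its virial to the
 * main force and virial. Also prints large forces when requested.
 */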
334 static void post_process_forces(const t_commrec           *cr,
335                                 int64_t                    step,
336                                 t_nrnb                    *nrnb,
337                                 gmx_wallcycle_t            wcycle,
338                                 const gmx_localtop_t      *top,
339                                 const matrix               box,
340                                 const rvec                 x[],
341                                 rvec                       f[],
342                                 gmx::ForceWithVirial      *forceWithVirial,
343                                 tensor                     vir_force,
344                                 const t_mdatoms           *mdatoms,
345                                 const t_graph             *graph,
346                                 const t_forcerec          *fr,
347                                 const gmx_vsite_t         *vsite,
348                                 int                        flags)
349 {
350     if (fr->haveDirectVirialContributions)
351     {
352         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
353
354         if (vsite)
355         {
356             /* Spread the mesh force on virtual sites to the other particles...
357              * This is parallelized. MPI communication is performed
358              * if the constructing atoms aren't local.
359              */
360             matrix virial = { { 0 } };
361             spread_vsite_f(vsite, x, fDirectVir, nullptr,
362                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
363                            nrnb,
364                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
365             forceWithVirial->addVirialContribution(virial);
366         }
367
368         if (flags & GMX_FORCE_VIRIAL)
369         {
370             /* Now add the forces, this is local */
371             sum_forces(f, forceWithVirial->force_);
372
373             /* Add the direct virial contributions */
374             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
375             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
376
377             if (debug)
378             {
379                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
380             }
381         }
382     }
383
384     if (fr->print_force >= 0)
385     {
386         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
387     }
388 }
389
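/*! \brief Dispatches the non-bonded kernel (CPU call or GPU launch) for the given interaction locality
 *
 * With CPU kernels, the dynamic pair-list pruning kernel is also run here when due.
 */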
390 static void do_nb_verlet(t_forcerec                       *fr,
391                          const interaction_const_t        *ic,
392                          gmx_enerdata_t                   *enerd,
393                          const int                         flags,
394                          const Nbnxm::InteractionLocality  ilocality,
395                          const int                         clearF,
396                          const int64_t                     step,
397                          t_nrnb                           *nrnb,
398                          gmx_wallcycle_t                   wcycle)
399 {
400     if (!(flags & GMX_FORCE_NONBONDED))
401     {
402         /* skip non-bonded calculation */
403         return;
404     }
405
406     nonbonded_verlet_t       *nbv  = fr->nbv;
407     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
408
409     /* GPU kernel launch overhead is already timed separately */
410     if (fr->cutoff_scheme != ecutsVERLET)
411     {
412         gmx_incons("Invalid cut-off scheme passed!");
413     }
414
415     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
416
417     if (!bUsingGpuKernels)
418     {
419         /* When dynamic pair-list pruning is requested, we need to prune
420          * at nstlistPrune steps.
421          */
422         if (nbv->listParams->useDynamicPruning &&
423             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
424         {
425             /* Prune the pair-list beyond fr->ic->rlistPrune using
426              * the current coordinates of the atoms.
427              */
428             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
429             NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
430             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
431         }
432
433         wallcycle_sub_start(wcycle, ewcsNONBONDED);
434     }
435
436     NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
437
438     if (!bUsingGpuKernels)
439     {
440         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
441     }
442 }
443
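/*! \brief Computes the perturbed non-bonded interactions with the free-energy kernel over the FEP pair lists
 *
 * When foreign lambdas are requested and soft-core is in use, their energy
 * contributions are also recomputed here.
 */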
444 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
445                              t_forcerec           *fr,
446                              rvec                  x[],
447                              rvec                  f[],
448                              const t_mdatoms      *mdatoms,
449                              t_lambda             *fepvals,
450                              real                 *lambda,
451                              gmx_enerdata_t       *enerd,
452                              int                   flags,
453                              t_nrnb               *nrnb,
454                              gmx_wallcycle_t       wcycle)
455 {
456     int              donb_flags;
457     nb_kernel_data_t kernel_data;
458     real             lam_i[efptNR];
459     real             dvdl_nb[efptNR];
460     int              th;
461     int              i, j;
462
463     donb_flags = 0;
464     /* Add short-range interactions */
465     donb_flags |= GMX_NONBONDED_DO_SR;
466
467     /* Currently all group scheme kernels always calculate (shift-)forces */
468     if (flags & GMX_FORCE_FORCES)
469     {
470         donb_flags |= GMX_NONBONDED_DO_FORCE;
471     }
472     if (flags & GMX_FORCE_VIRIAL)
473     {
474         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
475     }
476     if (flags & GMX_FORCE_ENERGY)
477     {
478         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
479     }
480
481     kernel_data.flags  = donb_flags;
482     kernel_data.lambda = lambda;
483     kernel_data.dvdl   = dvdl_nb;
484
485     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
486     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
487
488     /* reset free energy components */
489     for (i = 0; i < efptNR; i++)
490     {
491         dvdl_nb[i]  = 0;
492     }
493
494     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
495
496     wallcycle_sub_start(wcycle, ewcsNONBONDED);
497 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
498     for (th = 0; th < nbl_lists->nnbl; th++)
499     {
500         try
501         {
502             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
503                                       x, f, fr, mdatoms, &kernel_data, nrnb);
504         }
505         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
506     }
507
508     if (fepvals->sc_alpha != 0)
509     {
510         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
511         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
512     }
513     else
514     {
515         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
516         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
517     }
518
519     /* If we do foreign lambda and we have soft-core interactions
520      * we have to recalculate the (non-linear) energy contributions.
521      */
522     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
523     {
524         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
525         kernel_data.lambda         = lam_i;
526         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
527         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
528         /* Note that we add to kernel_data.dvdl, but ignore the result */
529
530         for (i = 0; i < enerd->n_lambda; i++)
531         {
532             for (j = 0; j < efptNR; j++)
533             {
534                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
535             }
536             reset_foreign_enerdata(enerd);
537 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
538             for (th = 0; th < nbl_lists->nnbl; th++)
539             {
540                 try
541                 {
542                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
543                                               x, f, fr, mdatoms, &kernel_data, nrnb);
544                 }
545                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
546             }
547
548             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
549             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
550         }
551     }
552
553     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
554 }
555
556 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
557 {
558     return nbv != nullptr && nbv->bUseGPU;
559 }
560
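/*! \brief Clears the first \p n entries of \p v, using OpenMP only when the task is large enough to benefit */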
561 static inline void clear_rvecs_omp(int n, rvec v[])
562 {
563     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
564
565     /* Note that we would like to avoid this conditional by putting it
566      * into the omp pragma instead, but then we still take the full
567      * omp parallel for overhead (at least with gcc5).
568      */
569     if (nth == 1)
570     {
571         for (int i = 0; i < n; i++)
572         {
573             clear_rvec(v[i]);
574         }
575     }
576     else
577     {
578 #pragma omp parallel for num_threads(nth) schedule(static)
579         for (int i = 0; i < n; i++)
580         {
581             clear_rvec(v[i]);
582         }
583     }
584 }
585
586 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
587  *
588  * \param groupOptions  Group options, containing T-coupling options
589  */
590 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
591 {
592     real nrdfCoupled   = 0;
593     real nrdfUncoupled = 0;
594     real kineticEnergy = 0;
595     for (int g = 0; g < groupOptions.ngtc; g++)
596     {
597         if (groupOptions.tau_t[g] >= 0)
598         {
599             nrdfCoupled   += groupOptions.nrdf[g];
600             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
601         }
602         else
603         {
604             nrdfUncoupled += groupOptions.nrdf[g];
605         }
606     }
607
608     /* This conditional with > also catches nrdf=0 */
609     if (nrdfCoupled > nrdfUncoupled)
610     {
611         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
612     }
613     else
614     {
615         return 0;
616     }
617 }
618
619 /*! \brief This routine checks that the potential energy is finite.
620  *
621  * Always checks that the potential energy is finite. If step equals
622  * inputrec.init_step also checks that the magnitude of the potential energy
623  * is reasonable. Terminates with a fatal error when a check fails.
624  * Note that passing this check does not guarantee finite forces,
625  * since those use slightly different arithmetic. But in most cases
626  * there is just a narrow coordinate range where forces are not finite
627  * and energies are finite.
628  *
629  * \param[in] step      The step number, used for checking and printing
630  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
631  * \param[in] inputrec  The input record
632  */
633 static void checkPotentialEnergyValidity(int64_t               step,
634                                          const gmx_enerdata_t &enerd,
635                                          const t_inputrec     &inputrec)
636 {
637     /* Threshold valid for comparing absolute potential energy against
638      * the kinetic energy. Normally one should not consider absolute
639      * potential energy values, but with a factor of one million
640      * we should never get false positives.
641      */
642     constexpr real c_thresholdFactor = 1e6;
643
644     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
645     real           averageKineticEnergy = 0;
646     /* We only check for large potential energy at the initial step,
647      * because that is by far the most likely step for this to occur
648      * and because computing the average kinetic energy is not free.
649      * Note: nstcalcenergy >> 1 often does not allow catching large energies
650      * before they become NaN.
651      */
652     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
653     {
654         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
655     }
656
657     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
658                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
659     {
660         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
661                   step,
662                   enerd.term[F_EPOT],
663                   energyIsNotFinite ? "not finite" : "extremely high",
664                   enerd.term[F_LJ],
665                   enerd.term[F_COUL_SR],
666                   energyIsNotFinite ? "non-finite" : "very high",
667                   energyIsNotFinite ? " or NaN" : "");
668     }
669 }
670
671 /*! \brief Compute forces and/or energies for special algorithms
672  *
673  * The intention is to collect all calls to algorithms that compute
674  * forces on local atoms only and that do not contribute to the local
675  * virial sum (but add their virial contribution separately).
676  * Eventually these should likely all become ForceProviders.
677  * Within this function the intention is to have algorithms that do
678  * global communication at the end, so global barriers within the MD loop
679  * are as close together as possible.
680  *
681  * \param[in]     fplog            The log file
682  * \param[in]     cr               The communication record
683  * \param[in]     inputrec         The input record
684  * \param[in]     awh              The Awh module (nullptr if none in use).
685  * \param[in]     enforcedRotation Enforced rotation module.
686  * \param[in]     step             The current MD step
687  * \param[in]     t                The current time
688  * \param[in,out] wcycle           Wallcycle accounting struct
689  * \param[in,out] forceProviders   Pointer to a list of force providers
690  * \param[in]     box              The unit cell
691  * \param[in]     x                The coordinates
692  * \param[in]     mdatoms          Per atom properties
693  * \param[in]     lambda           Array of free-energy lambda values
694  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
695  * \param[in,out] forceWithVirial  Force and virial buffers
696  * \param[in,out] enerd            Energy buffer
697  * \param[in,out] ed               Essential dynamics pointer
698  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
699  *
700  * \todo Remove bNS, which is used incorrectly.
701  * \todo Convert all other algorithms called here to ForceProviders.
702  */
703 static void
704 computeSpecialForces(FILE                          *fplog,
705                      const t_commrec               *cr,
706                      const t_inputrec              *inputrec,
707                      gmx::Awh                      *awh,
708                      gmx_enfrot                    *enforcedRotation,
709                      int64_t                        step,
710                      double                         t,
711                      gmx_wallcycle_t                wcycle,
712                      ForceProviders                *forceProviders,
713                      matrix                         box,
714                      gmx::ArrayRef<const gmx::RVec> x,
715                      const t_mdatoms               *mdatoms,
716                      real                          *lambda,
717                      int                            forceFlags,
718                      gmx::ForceWithVirial          *forceWithVirial,
719                      gmx_enerdata_t                *enerd,
720                      gmx_edsam                     *ed,
721                      gmx_bool                       bNS)
722 {
723     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
724
725     /* NOTE: Currently all ForceProviders only provide forces.
726      *       When they also provide energies, remove this conditional.
727      */
728     if (computeForces)
729     {
730         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
731         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
732
733         /* Collect forces from modules */
734         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
735     }
736
737     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
738     {
739         pull_potential_wrapper(cr, inputrec, box, x,
740                                forceWithVirial,
741                                mdatoms, enerd, lambda, t,
742                                wcycle);
743
744         if (awh)
745         {
746             enerd->term[F_COM_PULL] +=
747                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
748                                                   forceWithVirial,
749                                                   t, step, wcycle, fplog);
750         }
751     }
752
753     rvec *f = as_rvec_array(forceWithVirial->force_.data());
754
755     /* Add the forces from enforced rotation potentials (if any) */
756     if (inputrec->bRot)
757     {
758         wallcycle_start(wcycle, ewcROTadd);
759         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
760         wallcycle_stop(wcycle, ewcROTadd);
761     }
762
763     if (ed)
764     {
765         /* Note that since init_edsam() is called after the initialization
766          * of forcerec, edsam doesn't request the noVirSum force buffer.
767          * Thus if no other algorithm (e.g. PME) requires it, the forces
768          * here will contribute to the virial.
769          */
770         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
771     }
772
773     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
774     if (inputrec->bIMD && computeForces)
775     {
776         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
777     }
778 }
779
780 /*! \brief Launch the prepare_step and spread stages of PME GPU.
781  *
782  * \param[in]  pmedata       The PME structure
783  * \param[in]  box           The box matrix
784  * \param[in]  x             Coordinate array
785  * \param[in]  flags         Force flags
786  * \param[in]  pmeFlags      PME flags
787  * \param[in]  wcycle        The wallcycle structure
788  */
789 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
790                                       matrix          box,
791                                       rvec            x[],
792                                       int             flags,
793                                       int             pmeFlags,
794                                       gmx_wallcycle_t wcycle)
795 {
796     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
797     pme_gpu_launch_spread(pmedata, x, wcycle);
798 }
799
800 /*! \brief Launch the FFT and gather stages of PME GPU
801  *
802  * This function only implements setting the output forces (no accumulation).
803  *
804  * \param[in]  pmedata        The PME structure
805  * \param[in]  wcycle         The wallcycle structure
806  */
807 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
808                                      gmx_wallcycle_t   wcycle)
809 {
810     pme_gpu_launch_complex_transforms(pmedata, wcycle);
811     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
812 }
813
814 /*! \brief
815  *  Polling wait for either of the PME or nonbonded GPU tasks.
816  *
817  * Instead of a static order in waiting for GPU tasks, this function
818  * polls to check which of the two tasks completes first, and does the
819  * associated force buffer reduction overlapped with the other task.
820  * By doing that, unlike static scheduling order, it can always overlap
821  * one of the reductions, regardless of the GPU task completion order.
822  *
823  * \param[in]     nbv              Nonbonded verlet structure
824  * \param[in,out] pmedata          PME module data
825  * \param[in,out] force            Force array to reduce task outputs into.
826  * \param[in,out] forceWithVirial  Force and virial buffers
827  * \param[in,out] fshift           Shift force output vector results are reduced into
828  * \param[in,out] enerd            Energy data structure results are reduced into
829  * \param[in]     flags            Force flags
830  * \param[in]     pmeFlags         PME flags
831  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
832  * \param[in]     wcycle           The wallcycle structure
833  */
834 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
835                                         gmx_pme_t                           *pmedata,
836                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
837                                         gmx::ForceWithVirial                *forceWithVirial,
838                                         rvec                                 fshift[],
839                                         gmx_enerdata_t                      *enerd,
840                                         int                                  flags,
841                                         int                                  pmeFlags,
842                                         bool                                 haveOtherWork,
843                                         gmx_wallcycle_t                      wcycle)
844 {
845     bool isPmeGpuDone = false;
846     bool isNbGpuDone  = false;
847
848
849     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
850
851     while (!isPmeGpuDone || !isNbGpuDone)
852     {
853         if (!isPmeGpuDone)
854         {
855             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
856             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
857         }
858
859         if (!isNbGpuDone)
860         {
861             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
862             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
863             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
864                                                      flags,
865                                                      Nbnxm::AtomLocality::Local,
866                                                      haveOtherWork,
867                                                      enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
868                                                      fshift, completionType);
869             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
870             // To get the call count right, when the task finished we
871             // issue a start/stop.
872             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
873             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
874             if (isNbGpuDone)
875             {
876                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
877                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
878
879                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
880                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
881             }
882         }
883     }
884 }
885
886 /*! \brief
887  *  Launch the dynamic rolling pruning GPU task.
888  *
889  *  We currently alternate local/non-local list pruning in odd-even steps
890  *  (only pruning every second step without DD).
891  *
892  * \param[in]     cr               The communication record
893  * \param[in]     nbv              Nonbonded verlet structure
894  * \param[in]     inputrec         The input record
895  * \param[in]     step             The current MD step
896  */
897 static inline void launchGpuRollingPruning(const t_commrec          *cr,
898                                            const nonbonded_verlet_t *nbv,
899                                            const t_inputrec         *inputrec,
900                                            const int64_t             step)
901 {
902     /* We should not launch the rolling pruning kernel at a search
903      * step or just before search steps, since that's useless.
904      * Without domain decomposition we prune at even steps.
905      * With domain decomposition we alternate local and non-local
906      * pruning at even and odd steps.
907      */
908     int  numRollingParts     = nbv->listParams->numRollingParts;
909     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
910     int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
911     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
912     if (stepWithCurrentList > 0 &&
913         stepWithCurrentList < inputrec->nstlist - 1 &&
914         (stepIsEven || DOMAINDECOMP(cr)))
915     {
916         Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
917                                            stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
918                                            numRollingParts);
919     }
920 }
921
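/*! \brief Computes the forces for a single step with the Verlet cut-off scheme
 *
 * At search steps the atoms are (re)gridded and the pair lists are rebuilt;
 * PME, non-bonded and bonded GPU work is launched here when enabled.
 */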
922 static void do_force_cutsVERLET(FILE *fplog,
923                                 const t_commrec *cr,
924                                 const gmx_multisim_t *ms,
925                                 const t_inputrec *inputrec,
926                                 gmx::Awh *awh,
927                                 gmx_enfrot *enforcedRotation,
928                                 int64_t step,
929                                 t_nrnb *nrnb,
930                                 gmx_wallcycle_t wcycle,
931                                 const gmx_localtop_t *top,
932                                 const gmx_groups_t * /* groups */,
933                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
934                                 history_t *hist,
935                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
936                                 tensor vir_force,
937                                 const t_mdatoms *mdatoms,
938                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
939                                 real *lambda,
940                                 t_graph *graph,
941                                 t_forcerec *fr,
942                                 gmx::PpForceWorkload *ppForceWorkload,
943                                 interaction_const_t *ic,
944                                 const gmx_vsite_t *vsite,
945                                 rvec mu_tot,
946                                 double t,
947                                 gmx_edsam *ed,
948                                 const int flags,
949                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
950                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
951 {
952     int                 cg1, i, j;
953     double              mu[2*DIM];
954     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
955     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
956     rvec                vzero, box_diag;
957     float               cycles_pme, cycles_wait_gpu;
958     nonbonded_verlet_t *nbv = fr->nbv;
959
960     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
961     bNS           = ((flags & GMX_FORCE_NS) != 0);
962     bFillGrid     = (bNS && bStateChanged);
963     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
964     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
965     bUseGPU       = fr->nbv->bUseGPU;
966     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
967
968     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
969     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
970     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
971         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
972     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
973         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
974         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
975         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
976
977     /* At a search step we need to start the first balancing region
978      * somewhere early inside the step after communication during domain
979      * decomposition (and not during the previous step as usual).
980      */
981     if (bNS &&
982         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
983     {
984         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
985     }
986
987     cycles_wait_gpu = 0;
988
989     const int start  = 0;
990     const int homenr = mdatoms->homenr;
991
992     clear_mat(vir_force);
993
994     if (DOMAINDECOMP(cr))
995     {
996         cg1 = cr->dd->globalAtomGroupIndices.size();
997     }
998     else
999     {
1000         cg1 = top->cgs.nr;
1001     }
1002     if (fr->n_tpi > 0)
1003     {
1004         cg1--;
1005     }
1006
1007     if (bStateChanged)
1008     {
1009         update_forcerec(fr, box);
1010
1011         if (inputrecNeedMutot(inputrec))
1012         {
1013             /* Calculate total (local) dipole moment in a temporary common array.
1014              * This makes it possible to sum them over nodes faster.
1015              */
1016             calc_mu(start, homenr,
1017                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1018                     mu, mu+DIM);
1019         }
1020     }
1021
1022     if (fr->ePBC != epbcNONE)
1023     {
1024         /* Compute shift vectors every step,
1025          * because of pressure coupling or box deformation!
1026          */
1027         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1028         {
1029             calc_shifts(box, fr->shift_vec);
1030         }
1031
1032         if (bCalcCGCM)
1033         {
1034             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1035             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1036         }
1037         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1038         {
1039             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1040         }
1041     }
1042
1043     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1044                                  fr->shift_vec, nbv->nbat);
1045
1046 #if GMX_MPI
1047     if (!thisRankHasDuty(cr, DUTY_PME))
1048     {
1049         /* Send particle coordinates to the pme nodes.
1050          * Since this is only implemented for domain decomposition
1051          * and domain decomposition does not use the graph,
1052          * we do not need to worry about shifting.
1053          */
1054         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1055                                  lambda[efptCOUL], lambda[efptVDW],
1056                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1057                                  step, wcycle);
1058     }
1059 #endif /* GMX_MPI */
1060
1061     if (useGpuPme)
1062     {
1063         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1064     }
1065
1066     /* do gridding for pair search */
1067     if (bNS)
1068     {
1069         if (graph && bStateChanged)
1070         {
1071             /* Calculate intramolecular shift vectors to make molecules whole */
1072             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1073         }
1074
1075         clear_rvec(vzero);
1076         box_diag[XX] = box[XX][XX];
1077         box_diag[YY] = box[YY][YY];
1078         box_diag[ZZ] = box[ZZ][ZZ];
1079
1080         wallcycle_start(wcycle, ewcNS);
1081         if (!DOMAINDECOMP(cr))
1082         {
1083             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1084             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1085                               0, vzero, box_diag,
1086                               nullptr, 0, mdatoms->homenr, -1,
1087                               fr->cginfo, x.unpaddedArrayRef(),
1088                               0, nullptr,
1089                               nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
1090                               nbv->nbat);
1091             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1092         }
1093         else
1094         {
1095             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1096             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1097                                        fr->cginfo, x.unpaddedArrayRef(),
1098                                        nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
1099                                        nbv->nbat);
1100             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1101         }
1102
1103         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1104
1105         wallcycle_stop(wcycle, ewcNS);
1106     }
1107
1108     /* initialize the GPU atom data and copy shift vector */
1109     if (bUseGPU)
1110     {
1111         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1112         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1113
1114         if (bNS)
1115         {
1116             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1117         }
1118
1119         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1120
1121         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1122
1123         if (bNS && fr->gpuBonded)
1124         {
1125             /* Now that all atoms are on the grid, we can assign bonded
1126              * interactions to the GPU, where the grid order is
1127              * needed. Also the xq, f and fshift device buffers have
1128              * been reallocated if needed, so the bonded code can
1129              * learn about them. */
1130             // TODO the xq, f, and fshift buffers are now shared
1131             // resources, so they should be maintained by a
1132             // higher-level object than the nb module.
1133             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1134                                                                   top->idef,
1135                                                                   Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1136                                                                   Nbnxm::gpu_get_f(nbv->gpu_nbv),
1137                                                                   Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1138             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1139         }
1140
1141         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1142     }
1143
1144     /* do local pair search */
1145     if (bNS)
1146     {
1147         nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
1148
1149         wallcycle_start_nocount(wcycle, ewcNS);
1150         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1151         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1152                             &top->excls,
1153                             nbv->listParams->rlistOuter,
1154                             nbv->min_ci_balanced,
1155                             &pairlistSet,
1156                             Nbnxm::InteractionLocality::Local,
1157                             nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
1158                             nrnb);
1159         pairlistSet.outerListCreationStep = step;
1160         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1161         {
1162             nbnxnPrepareListForDynamicPruning(&pairlistSet);
1163         }
1164         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1165
1166         if (bUseGPU)
1167         {
1168             /* initialize local pair-list on the GPU */
1169             Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
1170                                      pairlistSet.nblGpu[0],
1171                                      Nbnxm::InteractionLocality::Local);
1172         }
1173         wallcycle_stop(wcycle, ewcNS);
1174     }
1175     else
1176     {
1177         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1178                                         FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1179                                         nbv->nbat, wcycle);
1180     }
1181
1182     if (bUseGPU)
1183     {
1184         if (DOMAINDECOMP(cr))
1185         {
1186             ddOpenBalanceRegionGpu(cr->dd);
1187         }
1188
1189         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1190
1191         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1192         Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1193         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1194
1195         // Bonded work is not split into separate local and non-local, so with DD
1196         // we can only launch the kernel after non-local coordinates have been received.
1197         if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1198         {
1199             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1200             fr->gpuBonded->launchKernels(fr, flags, box);
1201             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1202         }
1203
1204         /* launch local nonbonded work on GPU */
1205         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1206         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
1207                      step, nrnb, wcycle);
1208         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1209         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1210     }
1211
1212     if (useGpuPme)
1213     {
1214         // In PME GPU and mixed mode we launch FFT / gather after the
1215         // X copy/transform to allow overlap as well as after the GPU NB
1216         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1217         // the nonbonded kernel.
1218         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1219     }
1220
1221     /* Communicate coordinates and sum dipole if necessary +
1222        do non-local pair search */
1223     if (DOMAINDECOMP(cr))
1224     {
1225         nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
1226
1227         if (bNS)
1228         {
1229             wallcycle_start_nocount(wcycle, ewcNS);
1230             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1231
1232             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1233                                 &top->excls,
1234                                 nbv->listParams->rlistOuter,
1235                                 nbv->min_ci_balanced,
1236                                 &pairlistSet,
1237                                 Nbnxm::InteractionLocality::NonLocal,
1238                                 nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
1239                                 nrnb);
1240             pairlistSet.outerListCreationStep = step;
1241             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1242             {
1243                 nbnxnPrepareListForDynamicPruning(&pairlistSet);
1244             }
1245             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1246
1247             if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
1248             {
1249                 /* initialize non-local pair-list on the GPU */
1250                 Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
1251                                          pairlistSet.nblGpu[0],
1252                                          Nbnxm::InteractionLocality::NonLocal);
1253             }
1254             wallcycle_stop(wcycle, ewcNS);
1255         }
1256         else
1257         {
1258             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1259
1260             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1261                                             FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1262                                             nbv->nbat, wcycle);
1263         }
1264
1265         if (bUseGPU)
1266         {
1267             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1268
1269             /* launch non-local nonbonded tasks on GPU */
1270             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1271             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1272             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1273
1274             if (ppForceWorkload->haveGpuBondedWork)
1275             {
1276                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1277                 fr->gpuBonded->launchKernels(fr, flags, box);
1278                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1279             }
1280
1281             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1282             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1283                          step, nrnb, wcycle);
1284             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1285
1286             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1287         }
1288     }
1289
1290     if (bUseGPU)
1291     {
1292         /* launch the device-to-host (D2H) copy-back of the forces */
1293         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1294         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1295         if (DOMAINDECOMP(cr))
1296         {
1297             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1298                                       flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
1299         }
1300         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1301                                   flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
1302         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1303
1304         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1305         {
1306             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1307             fr->gpuBonded->launchEnergyTransfer();
1308             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1309         }
1310         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1311     }
1312
1313     if (bStateChanged && inputrecNeedMutot(inputrec))
1314     {
1315         if (PAR(cr))
1316         {
1317             gmx_sumd(2*DIM, mu, cr);
1318             ddReopenBalanceRegionCpu(cr->dd);
1319         }
1320
1321         for (i = 0; i < 2; i++)
1322         {
1323             for (j = 0; j < DIM; j++)
1324             {
1325                 fr->mu_tot[i][j] = mu[i*DIM + j];
1326             }
1327         }
1328     }
1329     if (fr->efep == efepNO)
1330     {
1331         copy_rvec(fr->mu_tot[0], mu_tot);
1332     }
1333     else
1334     {
1335         for (j = 0; j < DIM; j++)
1336         {
1337             mu_tot[j] =
1338                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1339                 lambda[efptCOUL]*fr->mu_tot[1][j];
1340         }
1341     }
1342
1343     /* Reset energies */
1344     reset_enerdata(enerd);
1345     clear_rvecs(SHIFTS, fr->fshift);
1346
1347     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1348     {
1349         wallcycle_start(wcycle, ewcPPDURINGPME);
1350         dd_force_flop_start(cr->dd, nrnb);
1351     }
1352
1353     if (inputrec->bRot)
1354     {
1355         wallcycle_start(wcycle, ewcROT);
1356         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1357         wallcycle_stop(wcycle, ewcROT);
1358     }
1359
1360     /* Temporary solution until all routines take PaddedRVecVector */
1361     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1362
1363     /* Start the force cycle counter.
1364      * Note that a different counter is used for dynamic load balancing.
1365      */
1366     wallcycle_start(wcycle, ewcFORCE);
1367
1368     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1369     if (bDoForces)
1370     {
1371         /* If we need to compute the virial, we might need a separate
1372          * force buffer for algorithms for which the virial is calculated
1373          * directly, such as PME.
1374          */
1375         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1376         {
1377             forceRef = *fr->forceBufferForDirectVirialContributions;
1378
1379             /* TODO: update comment
1380              * We only compute forces on local atoms. Note that vsites can
1381              * spread to non-local atoms, but that part of the buffer is
1382              * cleared separately in the vsite spreading code.
1383              */
1384             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1385         }
1386         /* Clear the short- and long-range forces */
1387         clear_rvecs_omp(fr->natoms_force_constr, f);
1388     }
1389
1390     /* forceWithVirial uses the local atom range only */
1391     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1392
1393     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1394     {
1395         clear_pull_forces(inputrec->pull_work);
1396     }
1397
1398     /* When the non-bonded forces are computed on the CPU, we calculate them here.
1399      * We do this before calling do_force_lowlevel, because in that
1400      * function, the listed forces are calculated before PME, which
1401      * does communication.  With this order, non-bonded and listed
1402      * force calculation imbalance can be balanced out by the domain
1403      * decomposition load balancing.
1404      */
1405
1406     if (!bUseOrEmulGPU)
1407     {
1408         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
1409                      step, nrnb, wcycle);
1410     }
1411
1412     if (fr->efep != efepNO)
1413     {
1414         /* Calculate the local and non-local free energy interactions here.
1415          * Happens here on the CPU both with and without GPU.
1416          */
1417         if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
1418         {
1419             do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
1420                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1421                              inputrec->fepvals, lambda,
1422                              enerd, flags, nrnb, wcycle);
1423         }
1424
1425         if (DOMAINDECOMP(cr) &&
1426             fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1427         {
1428             do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
1429                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1430                              inputrec->fepvals, lambda,
1431                              enerd, flags, nrnb, wcycle);
1432         }
1433     }
1434
1435     if (!bUseOrEmulGPU)
1436     {
1437         if (DOMAINDECOMP(cr))
1438         {
1439             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
1440                          step, nrnb, wcycle);
1441         }
1442
1443         const Nbnxm::InteractionLocality iloc =
1444             (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
1445
1446         /* Add all the non-bonded forces to the normal force array.
1447          * This can be split into a local and a non-local part when overlapping
1448          * communication with calculation under domain decomposition.
1449          */
1450         wallcycle_stop(wcycle, ewcFORCE);
1451
1452         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
1453
1454         wallcycle_start_nocount(wcycle, ewcFORCE);
1455
1456         /* if there are multiple fshift output buffers reduce them */
1457         if ((flags & GMX_FORCE_VIRIAL) &&
1458             nbv->grp[iloc].nbl_lists.nnbl > 1)
1459         {
1460             /* This is not in a subcounter because it takes a
1461                negligible and constant-sized amount of time */
1462             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1463                                                      fr->fshift);
1464         }
1465     }
1466
1467     /* update QMMMrec, if necessary */
1468     if (fr->bQMMM)
1469     {
1470         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1471     }
1472
1473     /* Compute the bonded and non-bonded energies and optionally forces */
1474     do_force_lowlevel(fr, inputrec, &(top->idef),
1475                       cr, ms, nrnb, wcycle, mdatoms,
1476                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1477                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1478                       flags, &cycles_pme);
1479
1480     wallcycle_stop(wcycle, ewcFORCE);
1481
1482     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1483                          step, t, wcycle,
1484                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1485                          flags, &forceWithVirial, enerd,
1486                          ed, bNS);
1487
1488     if (bUseOrEmulGPU)
1489     {
1490         /* wait for non-local forces (or calculate in emulation mode) */
1491         if (DOMAINDECOMP(cr))
1492         {
1493             if (bUseGPU)
1494             {
1495                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1496                 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1497                                             flags, Nbnxm::AtomLocality::NonLocal,
1498                                             ppForceWorkload->haveGpuBondedWork,
1499                                             enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1500                                             fr->fshift);
1501                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1502             }
1503             else
1504             {
1505                 wallcycle_start_nocount(wcycle, ewcFORCE);
1506                 do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
1507                              step, nrnb, wcycle);
1508                 wallcycle_stop(wcycle, ewcFORCE);
1509             }
1510
1511             /* skip the reduction if there was no non-local work to do */
1512             if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
1513             {
1514                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
1515                                                nbv->nbat, f, wcycle);
1516             }
1517         }
1518     }
1519
1520     if (DOMAINDECOMP(cr))
1521     {
1522         /* We are done with the CPU compute.
1523          * We will now communicate the non-local forces.
1524          * If we use a GPU this will overlap with GPU work, so in that case
1525          * we do not close the DD force balancing region here.
1526          */
1527         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1528         {
1529             ddCloseBalanceRegionCpu(cr->dd);
1530         }
1531         if (bDoForces)
1532         {
1533             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1534         }
1535     }
1536
1537     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1538     // an alternating wait/reduction scheme.
1539     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1540     if (alternateGpuWait)
1541     {
1542         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1543     }
1544
1545     if (!alternateGpuWait && useGpuPme)
1546     {
1547         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1548     }
1549
1550     /* Wait for local GPU NB outputs on the non-alternating wait path */
1551     if (!alternateGpuWait && bUseGPU)
1552     {
1553         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1554          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1555          * but even with a step of 0.1 ms the difference is less than 1%
1556          * of the step time.
1557          */
1558         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1559
1560         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1561         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
1562                                     flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
1563                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1564                                     fr->fshift);
1565         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1566
1567         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1568         {
1569             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1570             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1571             {
1572                 /* We measured few cycles, it could be that the kernel
1573                  * and transfer finished earlier and there was no actual
1574                  * wait time, only API call overhead.
1575                  * Then the actual time could be anywhere between 0 and
1576                  * cycles_wait_est. We will use half of cycles_wait_est.
1577                  */
1578                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1579             }
1580             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1581         }
1582     }
1583
1584     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1585     {
1586         // NOTE: emulation kernel is not included in the balancing region,
1587         // but emulation mode does not target performance anyway
1588         wallcycle_start_nocount(wcycle, ewcFORCE);
1589         do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
1590                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1591                      step, nrnb, wcycle);
1592         wallcycle_stop(wcycle, ewcFORCE);
1593     }
1594
1595     if (useGpuPme)
1596     {
1597         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1598     }
1599
1600     if (bUseGPU)
1601     {
1602         /* now clear the GPU outputs while we finish the step on the CPU */
1603         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1604         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1605         Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
1606
1607         /* Is dynamic pair-list pruning activated? */
1608         if (nbv->listParams->useDynamicPruning)
1609         {
1610             launchGpuRollingPruning(cr, nbv, inputrec, step);
1611         }
1612         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1613         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1614     }
1615
1616     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1617     {
1618         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1619         // in principle this should be included in the DD balancing region,
1620         // but generally it is infrequent so we'll omit it for the sake of
1621         // simpler code
1622         fr->gpuBonded->accumulateEnergyTerms(enerd);
1623         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1624
1625         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1626         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1627         fr->gpuBonded->clearEnergies();
1628         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1629         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1630     }
1631
1632     /* Do the nonbonded GPU (or emulation) force buffer reduction
1633      * on the non-alternating path. */
1634     if (bUseOrEmulGPU && !alternateGpuWait)
1635     {
1636         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
1637                                        nbv->nbat, f, wcycle);
1638     }
1639     if (DOMAINDECOMP(cr))
1640     {
1641         dd_force_flop_stop(cr->dd, nrnb);
1642     }
1643
1644     if (bDoForces)
1645     {
1646         /* If we have NoVirSum forces, but we do not calculate the virial,
1647          * we sum fr->f_novirsum=f later.
1648          */
1649         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1650         {
1651             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1652                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1653         }
1654
1655         if (flags & GMX_FORCE_VIRIAL)
1656         {
1657             /* Calculation of the virial must be done after vsites! */
1658             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1659                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1660         }
1661     }
1662
1663     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1664     {
1665         /* In case of node-splitting, the PP nodes receive the long-range
1666          * forces, virial and energy from the PME nodes here.
1667          */
1668         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1669     }
1670
1671     if (bDoForces)
1672     {
1673         post_process_forces(cr, step, nrnb, wcycle,
1674                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1675                             vir_force, mdatoms, graph, fr, vsite,
1676                             flags);
1677     }
1678
1679     if (flags & GMX_FORCE_ENERGY)
1680     {
1681         /* Sum the potential energy terms from group contributions */
1682         sum_epot(&(enerd->grpp), enerd->term);
1683
1684         if (!EI_TPI(inputrec->eI))
1685         {
1686             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1687         }
1688     }
1689 }
1690
1691 static void do_force_cutsGROUP(FILE *fplog,
1692                                const t_commrec *cr,
1693                                const gmx_multisim_t *ms,
1694                                const t_inputrec *inputrec,
1695                                gmx::Awh *awh,
1696                                gmx_enfrot *enforcedRotation,
1697                                int64_t step,
1698                                t_nrnb *nrnb,
1699                                gmx_wallcycle_t wcycle,
1700                                gmx_localtop_t *top,
1701                                const gmx_groups_t *groups,
1702                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1703                                history_t *hist,
1704                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1705                                tensor vir_force,
1706                                const t_mdatoms *mdatoms,
1707                                gmx_enerdata_t *enerd,
1708                                t_fcdata *fcd,
1709                                real *lambda,
1710                                t_graph *graph,
1711                                t_forcerec *fr,
1712                                const gmx_vsite_t *vsite,
1713                                rvec mu_tot,
1714                                double t,
1715                                gmx_edsam *ed,
1716                                int flags,
1717                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1718                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1719 {
1720     int        cg0, cg1, i, j;
1721     double     mu[2*DIM];
1722     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1723     gmx_bool   bDoForces;
1724     float      cycles_pme;
1725
1726     const int  start  = 0;
1727     const int  homenr = mdatoms->homenr;
1728
1729     clear_mat(vir_force);
1730
1731     cg0 = 0;
1732     if (DOMAINDECOMP(cr))
1733     {
1734         cg1 = cr->dd->globalAtomGroupIndices.size();
1735     }
1736     else
1737     {
1738         cg1 = top->cgs.nr;
1739     }
1740     if (fr->n_tpi > 0)
1741     {
1742         cg1--;
1743     }
1744
1745     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1746     bNS            = ((flags & GMX_FORCE_NS) != 0);
1747     /* Fill the neighbour-search grid only at a search step where the state changed */
1748     bFillGrid      = (bNS && bStateChanged);
1749     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1750     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1751
1752     if (bStateChanged)
1753     {
1754         update_forcerec(fr, box);
1755
1756         if (inputrecNeedMutot(inputrec))
1757         {
1758             /* Calculate total (local) dipole moment in a temporary common array.
1759              * This makes it possible to sum them over nodes faster.
1760              */
1761             calc_mu(start, homenr,
1762                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1763                     mu, mu+DIM);
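            /* Layout note: the first DIM entries of mu hold the state-A dipole
             * and the next DIM entries the state-B dipole, so a single
             * gmx_sumd over 2*DIM below reduces both states in one call.
             */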
1764         }
1765     }
1766
1767     if (fr->ePBC != epbcNONE)
1768     {
1769         /* Compute shift vectors every step,
1770          * because of pressure coupling or box deformation!
1771          */
1772         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1773         {
1774             calc_shifts(box, fr->shift_vec);
1775         }
1776
1777         if (bCalcCGCM)
1778         {
1779             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1780                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1781             inc_nrnb(nrnb, eNR_CGCM, homenr);
1782             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1783         }
1784         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1785         {
1786             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1787         }
1788     }
1789     else if (bCalcCGCM)
1790     {
1791         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1792         inc_nrnb(nrnb, eNR_CGCM, homenr);
1793     }
1794
1795     if (bCalcCGCM && gmx_debug_at)
1796     {
1797         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1798     }
1799
1800 #if GMX_MPI
1801     if (!thisRankHasDuty(cr, DUTY_PME))
1802     {
1803         /* Send particle coordinates to the pme nodes.
1804          * Since this is only implemented for domain decomposition
1805          * and domain decomposition does not use the graph,
1806          * we do not need to worry about shifting.
1807          */
1808         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1809                                  lambda[efptCOUL], lambda[efptVDW],
1810                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1811                                  step, wcycle);
1812     }
1813 #endif /* GMX_MPI */
1814
1815     /* Communicate coordinates and sum dipole if necessary */
1816     if (DOMAINDECOMP(cr))
1817     {
1818         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1819
1820         /* No GPU support, no move_x overlap, so reopen the balance region here */
1821         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1822         {
1823             ddReopenBalanceRegionCpu(cr->dd);
1824         }
1825     }
1826
1827     if (inputrecNeedMutot(inputrec))
1828     {
1829         if (bStateChanged)
1830         {
1831             if (PAR(cr))
1832             {
1833                 gmx_sumd(2*DIM, mu, cr);
1834                 ddReopenBalanceRegionCpu(cr->dd);
1835             }
1836             for (i = 0; i < 2; i++)
1837             {
1838                 for (j = 0; j < DIM; j++)
1839                 {
1840                     fr->mu_tot[i][j] = mu[i*DIM + j];
1841                 }
1842             }
1843         }
1844         if (fr->efep == efepNO)
1845         {
1846             copy_rvec(fr->mu_tot[0], mu_tot);
1847         }
1848         else
1849         {
1850             for (j = 0; j < DIM; j++)
1851             {
1852                 mu_tot[j] =
1853                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1854             }
1855         }
1856     }
1857
1858     /* Reset energies */
1859     reset_enerdata(enerd);
1860     clear_rvecs(SHIFTS, fr->fshift);
1861
1862     if (bNS)
1863     {
1864         wallcycle_start(wcycle, ewcNS);
1865
1866         if (graph && bStateChanged)
1867         {
1868             /* Calculate intramolecular shift vectors to make molecules whole */
1869             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1870         }
1871
1872         /* Do the actual neighbour searching */
1873         ns(fplog, fr, box,
1874            groups, top, mdatoms,
1875            cr, nrnb, bFillGrid);
1876
1877         wallcycle_stop(wcycle, ewcNS);
1878     }
1879
1880     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1881     {
1882         wallcycle_start(wcycle, ewcPPDURINGPME);
1883         dd_force_flop_start(cr->dd, nrnb);
1884     }
1885
1886     if (inputrec->bRot)
1887     {
1888         wallcycle_start(wcycle, ewcROT);
1889         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1890         wallcycle_stop(wcycle, ewcROT);
1891     }
1892
1893     /* Temporary solution until all routines take PaddedRVecVector */
1894     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1895
1896     /* Start the force cycle counter.
1897      * Note that a different counter is used for dynamic load balancing.
1898      */
1899     wallcycle_start(wcycle, ewcFORCE);
1900
1901     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1902     if (bDoForces)
1903     {
1904         /* If we need to compute the virial, we might need a separate
1905          * force buffer for algorithms for which the virial is calculated
1906          * separately, such as PME.
1907          */
1908         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1909         {
1910             forceRef = *fr->forceBufferForDirectVirialContributions;
1911
1912             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1913         }
1914
1915         /* Clear the short- and long-range forces */
1916         clear_rvecs(fr->natoms_force_constr, f);
1917     }
1918
1919     /* forceWithVirial might need the full force atom range */
1920     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1921
1922     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1923     {
1924         clear_pull_forces(inputrec->pull_work);
1925     }
1926
1927     /* update QMMMrec, if necessary */
1928     if (fr->bQMMM)
1929     {
1930         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1931     }
1932
1933     /* Compute the bonded and non-bonded energies and optionally forces */
1934     do_force_lowlevel(fr, inputrec, &(top->idef),
1935                       cr, ms, nrnb, wcycle, mdatoms,
1936                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1937                       box, inputrec->fepvals, lambda,
1938                       graph, &(top->excls), fr->mu_tot,
1939                       flags,
1940                       &cycles_pme);
1941
1942     wallcycle_stop(wcycle, ewcFORCE);
1943
1944     if (DOMAINDECOMP(cr))
1945     {
1946         dd_force_flop_stop(cr->dd, nrnb);
1947
1948         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1949         {
1950             ddCloseBalanceRegionCpu(cr->dd);
1951         }
1952     }
1953
1954     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1955                          step, t, wcycle,
1956                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1957                          flags, &forceWithVirial, enerd,
1958                          ed, bNS);
1959
1960     if (bDoForces)
1961     {
1962         /* Communicate the forces */
1963         if (DOMAINDECOMP(cr))
1964         {
1965             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1966             /* Do we need to communicate the separate force array
1967              * for terms that do not contribute to the single sum virial?
1968              * Position restraints and electric fields do not introduce
1969              * inter-cg forces, only full electrostatics methods do.
1970              * When we do not calculate the virial, fr->f_novirsum = f,
1971              * so we have already communicated these forces.
1972              */
1973             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1974                 (flags & GMX_FORCE_VIRIAL))
1975             {
1976                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
1977             }
1978         }
1979
1980         /* If we have NoVirSum forces, but we do not calculate the virial,
1981          * we sum fr->f_novirsum=f later.
1982          */
1983         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1984         {
1985             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1986                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1987         }
1988
1989         if (flags & GMX_FORCE_VIRIAL)
1990         {
1991             /* Calculation of the virial must be done after vsites! */
1992             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1993                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1994         }
1995     }
1996
1997     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1998     {
1999         /* In case of node-splitting, the PP nodes receive the long-range
2000          * forces, virial and energy from the PME nodes here.
2001          */
2002         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2003     }
2004
2005     if (bDoForces)
2006     {
2007         post_process_forces(cr, step, nrnb, wcycle,
2008                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2009                             vir_force, mdatoms, graph, fr, vsite,
2010                             flags);
2011     }
2012
2013     if (flags & GMX_FORCE_ENERGY)
2014     {
2015         /* Sum the potential energy terms from group contributions */
2016         sum_epot(&(enerd->grpp), enerd->term);
2017
2018         if (!EI_TPI(inputrec->eI))
2019         {
2020             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2021         }
2022     }
2023
2024 }
2025
2026 void do_force(FILE                                     *fplog,
2027               const t_commrec                          *cr,
2028               const gmx_multisim_t                     *ms,
2029               const t_inputrec                         *inputrec,
2030               gmx::Awh                                 *awh,
2031               gmx_enfrot                               *enforcedRotation,
2032               int64_t                                   step,
2033               t_nrnb                                   *nrnb,
2034               gmx_wallcycle_t                           wcycle,
2035               gmx_localtop_t                           *top,
2036               const gmx_groups_t                       *groups,
2037               matrix                                    box,
2038               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
2039               history_t                                *hist,
2040               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
2041               tensor                                    vir_force,
2042               const t_mdatoms                          *mdatoms,
2043               gmx_enerdata_t                           *enerd,
2044               t_fcdata                                 *fcd,
2045               gmx::ArrayRef<real>                       lambda,
2046               t_graph                                  *graph,
2047               t_forcerec                               *fr,
2048               gmx::PpForceWorkload                     *ppForceWorkload,
2049               const gmx_vsite_t                        *vsite,
2050               rvec                                      mu_tot,
2051               double                                    t,
2052               gmx_edsam                                *ed,
2053               int                                       flags,
2054               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2055               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2056 {
2057     /* modify force flag if not doing nonbonded */
2058     if (!fr->bNonbonded)
2059     {
2060         flags &= ~GMX_FORCE_NONBONDED;
2061     }
2062
2063     switch (inputrec->cutoff_scheme)
2064     {
2065         case ecutsVERLET:
2066             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2067                                 awh, enforcedRotation, step, nrnb, wcycle,
2068                                 top,
2069                                 groups,
2070                                 box, x, hist,
2071                                 force, vir_force,
2072                                 mdatoms,
2073                                 enerd, fcd,
2074                                 lambda.data(), graph,
2075                                 fr,
2076                                 ppForceWorkload,
2077                                 fr->ic,
2078                                 vsite, mu_tot,
2079                                 t, ed,
2080                                 flags,
2081                                 ddOpenBalanceRegion,
2082                                 ddCloseBalanceRegion);
2083             break;
2084         case ecutsGROUP:
2085             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2086                                awh, enforcedRotation, step, nrnb, wcycle,
2087                                top,
2088                                groups,
2089                                box, x, hist,
2090                                force, vir_force,
2091                                mdatoms,
2092                                enerd, fcd,
2093                                lambda.data(), graph,
2094                                fr, vsite, mu_tot,
2095                                t, ed,
2096                                flags,
2097                                ddOpenBalanceRegion,
2098                                ddCloseBalanceRegion);
2099             break;
2100         default:
2101             gmx_incons("Invalid cut-off scheme passed!");
2102     }
2103
2104     /* In case we don't have constraints and are using GPUs, the next balancing
2105      * region starts here.
2106      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2107      * virial calculation and COM pulling, is thus not included in
2108      * the balance timing, which is ok since most of these tasks involve communication.
2109      */
2110     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2111     {
2112         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2113     }
2114 }
2115
2116
2117 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2118                         const t_inputrec *ir, const t_mdatoms *md,
2119                         t_state *state)
2120 {
2121     int             i, m, start, end;
2122     int64_t         step;
2123     real            dt = ir->delta_t;
2124     real            dvdl_dum;
2125     rvec           *savex;
2126
2127     /* We need to allocate one element extra, since we might use
2128      * (unaligned) 4-wide SIMD loads to access rvec entries.
2129      */
2130     snew(savex, state->natoms + 1);
2131
2132     start = 0;
2133     end   = md->homenr;
2134
2135     if (debug)
2136     {
2137         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2138                 start, md->homenr, end);
2139     }
2140     /* Do a first constraint pass to reset the particles... */
2141     step = ir->init_step;
2142     if (fplog)
2143     {
2144         char buf[STEPSTRSIZE];
2145         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2146                 gmx_step_str(step, buf));
2147     }
2148     dvdl_dum = 0;
2149
2150     /* constrain the current position */
2151     constr->apply(TRUE, FALSE,
2152                   step, 0, 1.0,
2153                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
2154                   state->box,
2155                   state->lambda[efptBONDED], &dvdl_dum,
2156                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2157     if (EI_VV(ir->eI))
2158     {
2159         /* constrain the initial velocity, and save it */
2160         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2161         constr->apply(TRUE, FALSE,
2162                       step, 0, 1.0,
2163                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2164                       state->box,
2165                       state->lambda[efptBONDED], &dvdl_dum,
2166                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2167     }
2168     /* constrain the initial velocities at t-dt/2 */
2169     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2170     {
2171         auto x = makeArrayRef(state->x).subArray(start, end);
2172         auto v = makeArrayRef(state->v).subArray(start, end);
2173         for (i = start; (i < end); i++)
2174         {
2175             for (m = 0; (m < DIM); m++)
2176             {
2177                 /* Reverse the velocity */
2178                 v[i][m] = -v[i][m];
2179                 /* Store the position at t-dt in buf */
2180                 savex[i][m] = x[i][m] + dt*v[i][m];
2181             }
2182         }
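        /* At this point v holds the reversed velocities, so savex contains
         * x(0) + dt*(-v(0)) = x(0) - dt*v(0), i.e. an estimate of the
         * positions at t = -dt that is constrained below.
         */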
2183         /* Shake the positions at t=-dt with the positions at t=0
2184          * as reference coordinates.
2185          */
2186         if (fplog)
2187         {
2188             char buf[STEPSTRSIZE];
2189             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2190                     gmx_step_str(step, buf));
2191         }
2192         dvdl_dum = 0;
2193         constr->apply(TRUE, FALSE,
2194                       step, -1, 1.0,
2195                       state->x.rvec_array(), savex, nullptr,
2196                       state->box,
2197                       state->lambda[efptBONDED], &dvdl_dum,
2198                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2199
2200         for (i = start; i < end; i++)
2201         {
2202             for (m = 0; m < DIM; m++)
2203             {
2204                 /* Re-reverse the velocities */
2205                 v[i][m] = -v[i][m];
2206             }
2207         }
2208     }
2209     sfree(savex);
2210 }
2211
2212
2213 static void
2214 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2215                 double *enerout, double *virout)
2216 {
2217     double enersum, virsum;
2218     double invscale, invscale2, invscale3;
2219     double r, ea, eb, ec, pa, pb, pc, pd;
2220     double y0, f, g, h;
2221     int    ri, offset;
2222     double tabfactor;
2223
2224     invscale  = 1.0/scale;
2225     invscale2 = invscale*invscale;
2226     invscale3 = invscale*invscale2;
2227
2228     /* Following summation derived from cubic spline definition,
2229      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2230      * the cubic spline.  We first calculate the negative of the
2231      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2232      * add the more standard, abrupt cutoff correction to that result,
2233      * yielding the long-range correction for a switched function.  We
2234      * perform both the pressure and energy loops at the same time for
2235      * simplicity, as the computational cost is low. */
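    /* Sketch of where the ea..ec and pa..pd factors below come from:
     * within table bin ri the spline is y(t) = y0 + f*t + g*t^2 + h*t^3
     * with t in [0,1) in table units and r(t) = (ri + t)/scale.  Then
     *   r^2 dr = (ea*t^2 + eb*t + ec) dt   and   r^3 = pa*t^3 + pb*t^2 + pc*t + pd,
     * so per bin enersum accumulates the integral of y(r) r^2 dr and
     * virsum the integral of (dy/dr) r^3 dr, which expand into the
     * polynomial coefficient combinations used in the loop.
     */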
2236
2237     if (offstart == 0)
2238     {
2239         /* Since the dispersion table has been scaled down a factor
2240          * 6.0 and the repulsion a factor 12.0 to compensate for the
2241          * c6/c12 parameters inside nbfp[] being scaled up (to save
2242          * flops in kernels), we need to correct for this.
2243          */
2244         tabfactor = 6.0;
2245     }
2246     else
2247     {
2248         tabfactor = 12.0;
2249     }
2250
2251     enersum = 0.0;
2252     virsum  = 0.0;
2253     for (ri = rstart; ri < rend; ++ri)
2254     {
2255         r  = ri*invscale;
2256         ea = invscale3;
2257         eb = 2.0*invscale2*r;
2258         ec = invscale*r*r;
2259
2260         pa = invscale3;
2261         pb = 3.0*invscale2*r;
2262         pc = 3.0*invscale*r*r;
2263         pd = r*r*r;
2264
2265         /* The stride of 8 comes from the packing in the vdwtab array:
2266            four spline coefficients each for dispersion (offsets 0-3) and
2267            repulsion (offsets 4-7) per table point - perhaps this should be
2268            a named constant? */
2267
2268         offset = 8*ri + offstart;
2269         y0     = vdwtab[offset];
2270         f      = vdwtab[offset+1];
2271         g      = vdwtab[offset+2];
2272         h      = vdwtab[offset+3];
2273
2274         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2275         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2276     }
2277     *enerout = 4.0*M_PI*enersum*tabfactor;
2278     *virout  = 4.0*M_PI*virsum*tabfactor;
2279 }
2280
2281 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2282 {
2283     double   eners[2], virs[2], enersum, virsum;
2284     double   r0, rc3, rc9;
2285     int      ri0, ri1, i;
2286     real     scale, *vdwtab;
2287
2288     fr->enershiftsix    = 0;
2289     fr->enershifttwelve = 0;
2290     fr->enerdiffsix     = 0;
2291     fr->enerdifftwelve  = 0;
2292     fr->virdiffsix      = 0;
2293     fr->virdifftwelve   = 0;
2294
2295     const interaction_const_t *ic = fr->ic;
2296
2297     if (eDispCorr != edispcNO)
2298     {
2299         for (i = 0; i < 2; i++)
2300         {
2301             eners[i] = 0;
2302             virs[i]  = 0;
2303         }
2304         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2305             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2306             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2307             (ic->vdwtype == evdwSHIFT) ||
2308             (ic->vdwtype == evdwSWITCH))
2309         {
2310             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2311                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2312                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2313             {
2314                 gmx_fatal(FARGS,
2315                           "With dispersion correction rvdw-switch can not be zero "
2316                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2317             }
2318
2319             /* TODO This code depends on the logic in tables.c that
2320                constructs the table layout, which should be made
2321                explicit in future cleanup. */
2322             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2323                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2324             scale  = fr->dispersionCorrectionTable->scale;
2325             vdwtab = fr->dispersionCorrectionTable->data;
2326
2327             /* Round the cut-offs to exact table values for precision */
2328             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2329             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2330
2331             /* The code below has some support for handling force-switching, i.e.
2332              * when the force (instead of potential) is switched over a limited
2333              * region. This leads to a constant shift in the potential inside the
2334              * switching region, which we can handle by adding a constant energy
2335              * term in the force-switch case just like when we do potential-shift.
2336              *
2337              * For now this is not enabled, but to keep the functionality in the
2338              * code we check separately for switch and shift. When we do force-switch
2339              * the shifting point is rvdw_switch, while it is the cutoff when we
2340              * have a classical potential-shift.
2341              *
2342              * For a pure potential-shift the potential has a constant shift
2343              * all the way out to the cutoff, and that is it. For other forms
2344              * we need to calculate the constant shift up to the point where we
2345              * start modifying the potential.
2346              */
2347             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2348
2349             r0   = ri0/scale;
2350             rc3  = r0*r0*r0;
2351             rc9  = rc3*rc3*rc3;
2352
2353             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2354                 (ic->vdwtype == evdwSHIFT))
2355             {
2356                 /* Determine the constant energy shift below rvdw_switch.
2357                  * Table has a scale factor since we have scaled it down to compensate
2358                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2359                  */
2360                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2361                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2362             }
2363             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2364             {
2365                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2366                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2367             }
2368
2369             /* Add the constant part from 0 to rvdw_switch.
2370              * This integration from 0 to rvdw_switch overcounts the number
2371              * of interactions by 1, as it also counts the self interaction.
2372              * We will correct for this later.
2373              */
2374             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2375             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2376
2377             /* Calculate the contribution in the range [r0,r1] where we
2378              * modify the potential. For a pure potential-shift modifier we will
2379              * have ri0==ri1, and there will not be any contribution here.
2380              */
2381             for (i = 0; i < 2; i++)
2382             {
2383                 enersum = 0;
2384                 virsum  = 0;
2385                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2386                 eners[i] -= enersum;
2387                 virs[i]  -= virsum;
2388             }
2389
2390             /* Alright: Above we compensated by REMOVING the parts outside r0
2391              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2392              *
2393              * Regardless of whether r0 is the point where we start switching,
2394              * or the cutoff where we calculated the constant shift, we include
2395              * all the parts we are missing out to infinity from r0 by
2396              * calculating the analytical dispersion correction.
2397              */
2398             eners[0] += -4.0*M_PI/(3.0*rc3);
2399             eners[1] +=  4.0*M_PI/(9.0*rc9);
2400             virs[0]  +=  8.0*M_PI/rc3;
2401             virs[1]  += -16.0*M_PI/(3.0*rc9);
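            /* These are the analytic tail integrals for the unmodified r^-6 and
             * r^-12 terms beyond r0, e.g. the dispersion energy tail is
             *   4*pi * Int_{r0}^{inf} r^2 * (-1/r^6) dr = -4*pi/(3*r0^3),
             * and the corresponding virial terms use r*dV/dr in the integrand.
             */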
2402         }
2403         else if (ic->vdwtype == evdwCUT ||
2404                  EVDW_PME(ic->vdwtype) ||
2405                  ic->vdwtype == evdwUSER)
2406         {
2407             if (ic->vdwtype == evdwUSER && fplog)
2408             {
2409                 fprintf(fplog,
2410                         "WARNING: using dispersion correction with user tables\n");
2411             }
2412
2413             /* Note that with LJ-PME, the dispersion correction is multiplied
2414              * by the difference between the actual C6 and the value of C6
2415              * that would produce the combination rule.
2416              * This means the normal energy and virial difference formulas
2417              * can be used here.
2418              */
2419
2420             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2421             rc9  = rc3*rc3*rc3;
2422             /* Contribution beyond the cut-off */
2423             eners[0] += -4.0*M_PI/(3.0*rc3);
2424             eners[1] +=  4.0*M_PI/(9.0*rc9);
2425             if (ic->vdw_modifier == eintmodPOTSHIFT)
2426             {
2427                 /* Contribution within the cut-off */
2428                 eners[0] += -4.0*M_PI/(3.0*rc3);
2429                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2430             }
2431             /* Contribution beyond the cut-off */
2432             virs[0]  +=  8.0*M_PI/rc3;
2433             virs[1]  += -16.0*M_PI/(3.0*rc9);
2434         }
2435         else
2436         {
2437             gmx_fatal(FARGS,
2438                       "Dispersion correction is not implemented for vdw-type = %s",
2439                       evdw_names[ic->vdwtype]);
2440         }
2441
2442         /* When we deprecate the group kernels the code below can go too */
2443         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2444         {
2445             /* Calculate self-interaction coefficient (assuming that
2446              * the reciprocal-space contribution is constant in the
2447              * region that contributes to the self-interaction).
2448              */
2449             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2450
2451             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2452             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2453         }
2454
2455         fr->enerdiffsix    = eners[0];
2456         fr->enerdifftwelve = eners[1];
2457         /* The 0.5 is due to the Gromacs definition of the virial */
2458         fr->virdiffsix     = 0.5*virs[0];
2459         fr->virdifftwelve  = 0.5*virs[1];
2460     }
2461 }
2462
2463 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2464                    const matrix box, real lambda, tensor pres, tensor virial,
2465                    real *prescorr, real *enercorr, real *dvdlcorr)
2466 {
2467     gmx_bool bCorrAll, bCorrPres;
2468     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2469     int      m;
2470
2471     *prescorr = 0;
2472     *enercorr = 0;
2473     *dvdlcorr = 0;
2474
2475     clear_mat(virial);
2476     clear_mat(pres);
2477
2478     if (ir->eDispCorr != edispcNO)
2479     {
2480         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2481                      ir->eDispCorr == edispcAllEnerPres);
2482         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2483                      ir->eDispCorr == edispcAllEnerPres);
2484
2485         invvol = 1/det(box);
2486         if (fr->n_tpi)
2487         {
2488             /* Only correct for the interactions with the inserted molecule */
2489             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2490             ninter = fr->n_tpi;
2491         }
2492         else
2493         {
2494             dens   = fr->numAtomsForDispersionCorrection*invvol;
2495             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2496         }
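        /* ninter is the effective number of interacting pairs per correction
         * term: each atom sees the density of the others, and the factor 0.5
         * avoids counting every pair twice.  With TPI only the interactions
         * of the inserted molecule are corrected, so no 0.5 is applied there.
         */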
2497
2498         if (ir->efep == efepNO)
2499         {
2500             avcsix    = fr->avcsix[0];
2501             avctwelve = fr->avctwelve[0];
2502         }
2503         else
2504         {
2505             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2506             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2507         }
2508
2509         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2510         *enercorr += avcsix*enerdiff;
2511         dvdlambda  = 0.0;
2512         if (ir->efep != efepNO)
2513         {
2514             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2515         }
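        /* With the linear lambda interpolation of <C6> above,
         * d<C6>/dlambda = avcsix[1] - avcsix[0], so the dH/dlambda
         * contribution is simply that difference times enerdiff.
         */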
2516         if (bCorrAll)
2517         {
2518             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2519             *enercorr += avctwelve*enerdiff;
2520             if (fr->efep != efepNO)
2521             {
2522                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2523             }
2524         }
2525
2526         if (bCorrPres)
2527         {
2528             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2529             if (ir->eDispCorr == edispcAllEnerPres)
2530             {
2531                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2532             }
2533             /* The factor 2 is because of the Gromacs virial definition */
2534             spres = -2.0*invvol*svir*PRESFAC;
2535
2536             for (m = 0; m < DIM; m++)
2537             {
2538                 virial[m][m] += svir;
2539                 pres[m][m]   += spres;
2540             }
2541             *prescorr += spres;
2542         }
2543
2544         /* Can't currently control when this prints; for now, just print when debugging */
2545         if (debug)
2546         {
2547             if (bCorrAll)
2548             {
2549                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2550                         avcsix, avctwelve);
2551             }
2552             if (bCorrPres)
2553             {
2554                 fprintf(debug,
2555                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2556                         *enercorr, spres, svir);
2557             }
2558             else
2559             {
2560                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2561             }
2562         }
2563
2564         if (fr->efep != efepNO)
2565         {
2566             *dvdlcorr += dvdlambda;
2567         }
2568     }
2569 }
2570
2571 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2572                             const gmx_mtop_t *mtop, rvec x[],
2573                             gmx_bool bFirst)
2574 {
2575     t_graph        *graph;
2576     int             as, mol;
2577
2578     if (bFirst && fplog)
2579     {
2580         fprintf(fplog, "Removing pbc first time\n");
2581     }
2582
2583     snew(graph, 1);
2584     as = 0;
2585     for (const gmx_molblock_t &molb : mtop->molblock)
2586     {
2587         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2588         if (moltype.atoms.nr == 1 ||
2589             (!bFirst && moltype.cgs.nr == 1))
2590         {
2591             /* Just one atom or charge group in the molecule, no PBC required */
2592             as += molb.nmol*moltype.atoms.nr;
2593         }
2594         else
2595         {
2596             mk_graph_moltype(moltype, graph);
2597
2598             for (mol = 0; mol < molb.nmol; mol++)
2599             {
2600                 mk_mshift(fplog, graph, ePBC, box, x+as);
2601
2602                 shift_self(graph, box, x+as);
2603                 /* The molecule is whole now.
2604                  * We don't need the second mk_mshift call as in do_pbc_first,
2605                  * since we no longer need this graph.
2606                  */
2607
2608                 as += moltype.atoms.nr;
2609             }
2610             done_graph(graph);
2611         }
2612     }
2613     sfree(graph);
2614 }
2615
2616 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2617                        const gmx_mtop_t *mtop, rvec x[])
2618 {
2619     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2620 }
2621
2622 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2623                  const gmx_mtop_t *mtop, rvec x[])
2624 {
2625     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2626 }
2627
2628 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2629 {
2630     int t, nth;
2631     nth = gmx_omp_nthreads_get(emntDefault);
2632
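    /* Each thread handles a contiguous chunk of atoms; computing the bounds
     * as (natoms*t)/nth guarantees the chunks tile the whole range without
     * gaps or overlap, even when natoms is not divisible by nth.
     */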
2633 #pragma omp parallel for num_threads(nth) schedule(static)
2634     for (t = 0; t < nth; t++)
2635     {
2636         try
2637         {
2638             size_t natoms = x.size();
2639             size_t offset = (natoms*t    )/nth;
2640             size_t len    = (natoms*(t + 1))/nth - offset;
2641             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2642         }
2643         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2644     }
2645 }
2646
2647 // TODO This can be cleaned up a lot, and move back to runner.cpp
2648 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2649                 const t_inputrec *inputrec,
2650                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2651                 gmx_walltime_accounting_t walltime_accounting,
2652                 nonbonded_verlet_t *nbv,
2653                 const gmx_pme_t *pme,
2654                 gmx_bool bWriteStat)
2655 {
2656     t_nrnb *nrnb_tot = nullptr;
2657     double  delta_t  = 0;
2658     double  nbfs     = 0, mflop = 0;
2659     double  elapsed_time,
2660             elapsed_time_over_all_ranks,
2661             elapsed_time_over_all_threads,
2662             elapsed_time_over_all_threads_over_all_ranks;
2663     /* Control whether it is valid to print a report. Only the
2664        simulation master may print, but it should not do so if the run
2665        terminated e.g. before a scheduled reset step. This is
2666        complicated by the fact that PME ranks are unaware of the
2667        reason why they were sent a pmerecvqxFINISH. To avoid
2668        communication deadlocks, we always do the communication for the
2669        report, even if we've decided not to write the report, because
2670        how long it takes to finish the run is not important when we've
2671        decided not to report on the simulation performance.
2672
2673        Further, we only report performance for dynamical integrators,
2674        because those are the only ones for which we plan to do any
2675        performance optimization. */
2676     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2677
2678     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2679     {
2680         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2681         printReport = false;
2682     }
2683
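    /* Sum the flop counters over all ranks of this simulation */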
2684     if (cr->nnodes > 1)
2685     {
2686         snew(nrnb_tot, 1);
2687 #if GMX_MPI
2688         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2689                       cr->mpi_comm_mysim);
2690 #endif
2691     }
2692     else
2693     {
2694         nrnb_tot = nrnb;
2695     }
2696
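    /* Gather elapsed wall-clock times: average the elapsed time over ranks,
     * and sum the per-rank thread times over all ranks. */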
2697     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2698     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2699     if (cr->nnodes > 1)
2700     {
2701 #if GMX_MPI
2702         /* reduce elapsed_time over all MPI ranks in the current simulation */
2703         MPI_Allreduce(&elapsed_time,
2704                       &elapsed_time_over_all_ranks,
2705                       1, MPI_DOUBLE, MPI_SUM,
2706                       cr->mpi_comm_mysim);
2707         elapsed_time_over_all_ranks /= cr->nnodes;
2708         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2709          * current simulation. */
2710         MPI_Allreduce(&elapsed_time_over_all_threads,
2711                       &elapsed_time_over_all_threads_over_all_ranks,
2712                       1, MPI_DOUBLE, MPI_SUM,
2713                       cr->mpi_comm_mysim);
2714 #endif
2715     }
2716     else
2717     {
2718         elapsed_time_over_all_ranks                  = elapsed_time;
2719         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2720     }
2721
2722     if (printReport)
2723     {
2724         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2725     }
2726     if (cr->nnodes > 1)
2727     {
2728         sfree(nrnb_tot);
2729     }
2730
2731     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2732     {
2733         print_dd_statistics(cr, inputrec, fplog);
2734     }
2735
2736     /* TODO Move the responsibility for any scaling by thread counts
2737      * to the code that handled the thread region, so that there's a
2738      * mechanism to keep cycle counting working during the transition
2739      * to task parallelism. */
2740     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2741     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2742     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2743     auto cycle_sum(wallcycle_sum(cr, wcycle));
2744
2745     if (printReport)
2746     {
2747         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
2748         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2749         if (pme_gpu_task_enabled(pme))
2750         {
2751             pme_gpu_get_timings(pme, &pme_gpu_timings);
2752         }
2753         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2754                         elapsed_time_over_all_ranks,
2755                         wcycle, cycle_sum,
2756                         nbnxn_gpu_timings,
2757                         &pme_gpu_timings);
2758
2759         if (EI_DYNAMICS(inputrec->eI))
2760         {
2761             delta_t = inputrec->delta_t;
2762         }
2763
2764         if (fplog)
2765         {
2766             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2767                        elapsed_time_over_all_ranks,
2768                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2769                        delta_t, nbfs, mflop);
2770         }
2771         if (bWriteStat)
2772         {
2773             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2774                        elapsed_time_over_all_ranks,
2775                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2776                        delta_t, nbfs, mflop);
2777         }
2778     }
2779 }
2780
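/*! \brief Initialize the lambda state at the start of a run.
 *
 * Does nothing unless free-energy perturbation or simulated tempering is in
 * use. Sets the starting FEP state and the \p lambda components on the
 * master rank, copies the components into \p lam0 when provided, rescales
 * the reference temperatures for simulated tempering, and writes the
 * initial lambda vector to the log file.
 */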
2781 void initialize_lambdas(FILE               *fplog,
2782                         const t_inputrec   &ir,
2783                         bool                isMaster,
2784                         int                *fep_state,
2785                         gmx::ArrayRef<real> lambda,
2786                         double             *lam0)
2787 {
2788     /* TODO: Clean up initialization of fep_state and lambda in
2789        t_state.  This function works, but could probably use a logic
2790        rewrite to keep all the different types of efep straight. */
2791
2792     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2793     {
2794         return;
2795     }
2796
2797     const t_lambda *fep = ir.fepvals;
2798     if (isMaster)
2799     {
2800         *fep_state = fep->init_fep_state; /* this might overwrite the state read
2801                                              from a checkpoint; a kludge is in place
2802                                              for now to prevent that. */
2803     }
2804
2805     for (int i = 0; i < efptNR; i++)
2806     {
2807         double thisLambda;
2808         /* For now, overwrite the lambda state with init_lambda for backwards compatibility */
2809         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2810         {
2811             thisLambda = fep->init_lambda;
2812         }
2813         else
2814         {
2815             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2816         }
2817         if (isMaster)
2818         {
2819             lambda[i] = thisLambda;
2820         }
2821         if (lam0 != nullptr)
2822         {
2823             lam0[i] = thisLambda;
2824         }
2825     }
2826     if (ir.bSimTemp)
2827     {
2828         /* need to rescale control temperatures to match current state */
2829         for (int i = 0; i < ir.opts.ngtc; i++)
2830         {
2831             if (ir.opts.ref_t[i] > 0)
2832             {
2833                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2834             }
2835         }
2836     }
2837
2838     /* Write the initial lambda components to the log */
2839     if (fplog != nullptr)
2840     {
2841         fprintf(fplog, "Initial vector of lambda components: [ ");
2842         for (const auto &l : lambda)
2843         {
2844             fprintf(fplog, "%10.4f ", l);
2845         }
2846         fprintf(fplog, "]\n");
2847     }
2848 }