clang-tidy: readability-non-const-parameter (2/2)
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <stdio.h>
44 #include <string.h>
45
46 #include <cmath>
47 #include <cstdint>
48
49 #include <array>
50
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/sysinfo.h"
115
116 #include "nbnxn_gpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
119
120 // TODO: this environment variable allows us to verify before release
121 // that on less common architectures the total cost of polling is not larger than
122 // that of a blocking wait (so polling does not introduce overhead when the static
123 // PME-first ordering would suffice).
124 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
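/* A usage sketch (hypothetical run name "bench", everything else as in this file):
 * timing the same system with and without the variable set, e.g.
 *     GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun -deffnm bench
 * and comparing the GPU wait entries in the wallcycle report indicates whether
 * polling adds measurable overhead on a given architecture.
 */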
125
126
127 void print_time(FILE                     *out,
128                 gmx_walltime_accounting_t walltime_accounting,
129                 gmx_int64_t               step,
130                 t_inputrec               *ir,
131                 const t_commrec          *cr)
132 {
133     time_t finish;
134     char   timebuf[STRLEN];
135     double dt, elapsed_seconds, time_per_step;
136     char   buf[48];
137
138 #if !GMX_THREAD_MPI
139     if (!PAR(cr))
140 #endif
141     {
142         fprintf(out, "\r");
143     }
144     fprintf(out, "step %s", gmx_step_str(step, buf));
145     fflush(out);
146
147     if ((step >= ir->nstlist))
148     {
149         double seconds_since_epoch = gmx_gettime();
150         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
152         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
153
154         if (ir->nsteps >= 0)
155         {
156             if (dt >= 300)
157             {
158                 finish = (time_t) (seconds_since_epoch + dt);
159                 gmx_ctime_r(&finish, timebuf, STRLEN);
160                 sprintf(buf, "%s", timebuf);
161                 buf[strlen(buf)-1] = '\0';
162                 fprintf(out, ", will finish %s", buf);
163             }
164             else
165             {
166                 fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
167             }
168         }
169         else
170         {
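            /* ir->delta_t is in ps per step, so delta_t/1000 is ns per step;
             * dividing by time_per_step (seconds per step) gives ns per second,
             * and the factor 24*60*60 converts that to ns per day.
             */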
171             fprintf(out, " performance: %.1f ns/day    ",
172                     ir->delta_t/1000*24*60*60/time_per_step);
173         }
174     }
175 #if !GMX_THREAD_MPI
176     if (PAR(cr))
177     {
178         fprintf(out, "\n");
179     }
180 #else
181     GMX_UNUSED_VALUE(cr);
182 #endif
183
184     fflush(out);
185 }
186
187 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
188                          double the_time)
189 {
190     char   time_string[STRLEN];
191
192     if (!fplog)
193     {
194         return;
195     }
196
197     {
198         int    i;
199         char   timebuf[STRLEN];
200         time_t temp_time = (time_t) the_time;
201
202         gmx_ctime_r(&temp_time, timebuf, STRLEN);
203         for (i = 0; timebuf[i] >= ' '; i++)
204         {
205             time_string[i] = timebuf[i];
206         }
207         time_string[i] = '\0';
208     }
209
210     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
211 }
212
213 void print_start(FILE *fplog, const t_commrec *cr,
214                  gmx_walltime_accounting_t walltime_accounting,
215                  const char *name)
216 {
217     char buf[STRLEN];
218
219     sprintf(buf, "Started %s", name);
220     print_date_and_time(fplog, cr->nodeid, buf,
221                         walltime_accounting_get_start_time_stamp(walltime_accounting));
222 }
223
224 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
225 {
226     const int end = forceToAdd.size();
227
228     // cppcheck-suppress unreadVariable
229     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
230 #pragma omp parallel for num_threads(nt) schedule(static)
231     for (int i = 0; i < end; i++)
232     {
233         rvec_inc(f[i], forceToAdd[i]);
234     }
235 }
236
237 static void pme_gpu_reduce_outputs(gmx_wallcycle_t            wcycle,
238                                    ForceWithVirial           *forceWithVirial,
239                                    ArrayRef<const gmx::RVec>  pmeForces,
240                                    gmx_enerdata_t            *enerd,
241                                    const tensor               vir_Q,
242                                    real                       Vlr_q)
243 {
244     wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
245     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
246     forceWithVirial->addVirialContribution(vir_Q);
247     enerd->term[F_COUL_RECIP] += Vlr_q;
248     sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
249     wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
250 }
251
252 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
253                         tensor vir_part, t_graph *graph, matrix box,
254                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
255 {
256     int    i;
257
258     /* The short-range virial from surrounding boxes */
259     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
260     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
261
262     /* Calculate partial virial, for local atoms only, based on short range.
263      * Total virial is computed in global_stat, called from do_md
264      */
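    /* For reference, a sketch of the standard single-sum form (not text from
     * this file): the virial accumulated here is
     *     Xi = -1/2 * sum_i x_i (outer product) f_i,
     * evaluated only over the local atoms in [start, start+homenr).
     */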
265     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
266     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
267
268     /* Add wall contribution */
269     for (i = 0; i < DIM; i++)
270     {
271         vir_part[i][ZZ] += fr->vir_wall_z[i];
272     }
273
274     if (debug)
275     {
276         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
277     }
278 }
279
280 static void pull_potential_wrapper(const t_commrec *cr,
281                                    const t_inputrec *ir,
282                                    matrix box, ArrayRef<const RVec> x,
283                                    ForceWithVirial *force,
284                                    const t_mdatoms *mdatoms,
285                                    gmx_enerdata_t *enerd,
286                                    real *lambda,
287                                    double t,
288                                    gmx_wallcycle_t wcycle)
289 {
290     t_pbc  pbc;
291     real   dvdl;
292
293     /* Calculate the center of mass forces; this requires communication,
294      * which is why pull_potential is called close to other communication.
295      */
296     wallcycle_start(wcycle, ewcPULLPOT);
297     set_pbc(&pbc, ir->ePBC, box);
298     dvdl                     = 0;
299     enerd->term[F_COM_PULL] +=
300         pull_potential(ir->pull_work, mdatoms, &pbc,
301                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
302     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
303     wallcycle_stop(wcycle, ewcPULLPOT);
304 }
305
306 static void pme_receive_force_ener(const t_commrec *cr,
307                                    ForceWithVirial *forceWithVirial,
308                                    gmx_enerdata_t  *enerd,
309                                    gmx_wallcycle_t  wcycle)
310 {
311     real   e_q, e_lj, dvdl_q, dvdl_lj;
312     float  cycles_ppdpme, cycles_seppme;
313
314     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
315     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
316
317     /* In case of node-splitting, the PP nodes receive the long-range
318      * forces, virial and energy from the PME nodes here.
319      */
320     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
321     dvdl_q  = 0;
322     dvdl_lj = 0;
323     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
324                       &cycles_seppme);
325     enerd->term[F_COUL_RECIP] += e_q;
326     enerd->term[F_LJ_RECIP]   += e_lj;
327     enerd->dvdl_lin[efptCOUL] += dvdl_q;
328     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
329
330     if (wcycle)
331     {
332         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
333     }
334     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
335 }
336
337 static void print_large_forces(FILE            *fp,
338                                const t_mdatoms *md,
339                                const t_commrec *cr,
340                                gmx_int64_t      step,
341                                real             forceTolerance,
342                                const rvec      *x,
343                                const rvec      *f)
344 {
345     real           force2Tolerance = gmx::square(forceTolerance);
346     std::uintmax_t numNonFinite    = 0;
347     for (int i = 0; i < md->homenr; i++)
348     {
349         real force2    = norm2(f[i]);
350         bool nonFinite = !std::isfinite(force2);
351         if (force2 >= force2Tolerance || nonFinite)
352         {
353             fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
354                     step,
355                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
356         }
357         if (nonFinite)
358         {
359             numNonFinite++;
360         }
361     }
362     if (numNonFinite > 0)
363     {
364         /* Note that with MPI this fatal call on one rank might interrupt
365          * the printing on other ranks. But we can only avoid that with
366          * an expensive MPI barrier that we would need at each step.
367          */
368         gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
369     }
370 }
371
372 static void post_process_forces(const t_commrec      *cr,
373                                 gmx_int64_t           step,
374                                 t_nrnb               *nrnb,
375                                 gmx_wallcycle_t       wcycle,
376                                 const gmx_localtop_t *top,
377                                 matrix                box,
378                                 rvec                  x[],
379                                 rvec                  f[],
380                                 ForceWithVirial      *forceWithVirial,
381                                 tensor                vir_force,
382                                 const t_mdatoms      *mdatoms,
383                                 t_graph              *graph,
384                                 t_forcerec           *fr,
385                                 const gmx_vsite_t    *vsite,
386                                 int                   flags)
387 {
388     if (fr->haveDirectVirialContributions)
389     {
390         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
391
392         if (vsite)
393         {
394             /* Spread the mesh force on virtual sites to the other particles...
395              * This is parallelized. MPI communication is performed
396              * if the constructing atoms aren't local.
397              */
398             matrix virial = { { 0 } };
399             spread_vsite_f(vsite, x, fDirectVir, nullptr,
400                            (flags & GMX_FORCE_VIRIAL), virial,
401                            nrnb,
402                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
403             forceWithVirial->addVirialContribution(virial);
404         }
405
406         if (flags & GMX_FORCE_VIRIAL)
407         {
408             /* Now add the forces, this is local */
409             sum_forces(f, forceWithVirial->force_);
410
411             /* Add the direct virial contributions */
412             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
413             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
414
415             if (debug)
416             {
417                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
418             }
419         }
420     }
421
422     if (fr->print_force >= 0)
423     {
424         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
425     }
426 }
427
428 static void do_nb_verlet(t_forcerec *fr,
429                          interaction_const_t *ic,
430                          gmx_enerdata_t *enerd,
431                          int flags, int ilocality,
432                          int clearF,
433                          gmx_int64_t step,
434                          t_nrnb *nrnb,
435                          gmx_wallcycle_t wcycle)
436 {
437     if (!(flags & GMX_FORCE_NONBONDED))
438     {
439         /* skip non-bonded calculation */
440         return;
441     }
442
443     nonbonded_verlet_t       *nbv  = fr->nbv;
444     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
445
446     /* GPU kernel launch overhead is already timed separately */
447     if (fr->cutoff_scheme != ecutsVERLET)
448     {
449         gmx_incons("Invalid cut-off scheme passed!");
450     }
451
452     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
453
454     if (!bUsingGpuKernels)
455     {
456         /* When dynamic pair-list pruning is requested, we need to prune
457          * every nstlistPrune steps.
458          */
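        /* Worked example (illustrative numbers): with nstlistPrune = 10 and a
         * pair list created at step 1000, the CPU prune kernel below runs
         * whenever (step - 1000) is a multiple of 10, using the current
         * coordinates and the inner list radius.
         */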
459         if (nbv->listParams->useDynamicPruning &&
460             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
461         {
462             /* Prune the pair-list beyond fr->ic->rlistPrune using
463              * the current coordinates of the atoms.
464              */
465             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
466             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
467             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
468         }
469
470         wallcycle_sub_start(wcycle, ewcsNONBONDED);
471     }
472
473     switch (nbvg->kernel_type)
474     {
475         case nbnxnk4x4_PlainC:
476         case nbnxnk4xN_SIMD_4xN:
477         case nbnxnk4xN_SIMD_2xNN:
478             nbnxn_kernel_cpu(nbvg,
479                              nbv->nbat,
480                              ic,
481                              fr->shift_vec,
482                              flags,
483                              clearF,
484                              fr->fshift[0],
485                              enerd->grpp.ener[egCOULSR],
486                              fr->bBHAM ?
487                              enerd->grpp.ener[egBHAMSR] :
488                              enerd->grpp.ener[egLJSR]);
489             break;
490
491         case nbnxnk8x8x8_GPU:
492             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
493             break;
494
495         case nbnxnk8x8x8_PlainC:
496             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
497                                  nbv->nbat, ic,
498                                  fr->shift_vec,
499                                  flags,
500                                  clearF,
501                                  nbv->nbat->out[0].f,
502                                  fr->fshift[0],
503                                  enerd->grpp.ener[egCOULSR],
504                                  fr->bBHAM ?
505                                  enerd->grpp.ener[egBHAMSR] :
506                                  enerd->grpp.ener[egLJSR]);
507             break;
508
509         default:
510             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
511
512     }
513     if (!bUsingGpuKernels)
514     {
515         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
516     }
517
518     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
519     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
520     {
521         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
522     }
523     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
524              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
525     {
526         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
527     }
528     else
529     {
530         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
531     }
532     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
533     if (flags & GMX_FORCE_ENERGY)
534     {
535         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
536         enr_nbnxn_kernel_ljc += 1;
537         enr_nbnxn_kernel_lj  += 1;
538     }
539
540     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
541              nbvg->nbl_lists.natpair_ljq);
542     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
543              nbvg->nbl_lists.natpair_lj);
544     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
545     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
546              nbvg->nbl_lists.natpair_q);
547
548     if (ic->vdw_modifier == eintmodFORCESWITCH)
549     {
550         /* We add up the switch cost separately */
551         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
552                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
553     }
554     if (ic->vdw_modifier == eintmodPOTSWITCH)
555     {
556         /* We add up the switch cost separately */
557         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
558                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
559     }
560     if (ic->vdwtype == evdwPME)
561     {
562         /* We add up the LJ Ewald cost separately */
563         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
564                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
565     }
566 }
567
568 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
569                              t_forcerec           *fr,
570                              rvec                  x[],
571                              rvec                  f[],
572                              const t_mdatoms      *mdatoms,
573                              t_lambda             *fepvals,
574                              real                 *lambda,
575                              gmx_enerdata_t       *enerd,
576                              int                   flags,
577                              t_nrnb               *nrnb,
578                              gmx_wallcycle_t       wcycle)
579 {
580     int              donb_flags;
581     nb_kernel_data_t kernel_data;
582     real             lam_i[efptNR];
583     real             dvdl_nb[efptNR];
584     int              th;
585     int              i, j;
586
587     donb_flags = 0;
588     /* Add short-range interactions */
589     donb_flags |= GMX_NONBONDED_DO_SR;
590
591     /* Currently all group scheme kernels always calculate (shift-)forces */
592     if (flags & GMX_FORCE_FORCES)
593     {
594         donb_flags |= GMX_NONBONDED_DO_FORCE;
595     }
596     if (flags & GMX_FORCE_VIRIAL)
597     {
598         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
599     }
600     if (flags & GMX_FORCE_ENERGY)
601     {
602         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
603     }
604
605     kernel_data.flags  = donb_flags;
606     kernel_data.lambda = lambda;
607     kernel_data.dvdl   = dvdl_nb;
608
609     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
610     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
611
612     /* reset free energy components */
613     for (i = 0; i < efptNR; i++)
614     {
615         dvdl_nb[i]  = 0;
616     }
617
618     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
619
620     wallcycle_sub_start(wcycle, ewcsNONBONDED);
621 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
622     for (th = 0; th < nbl_lists->nnbl; th++)
623     {
624         try
625         {
626             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
627                                       x, f, fr, mdatoms, &kernel_data, nrnb);
628         }
629         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
630     }
631
632     if (fepvals->sc_alpha != 0)
633     {
634         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
635         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
636     }
637     else
638     {
639         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
640         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
641     }
642
643     /* If we do foreign lambda and we have soft-core interactions
644      * we have to recalculate the (non-linear) energy contributions.
645      */
646     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
647     {
648         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
649         kernel_data.lambda         = lam_i;
650         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
651         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
652         /* Note that we add to kernel_data.dvdl, but ignore the result */
653
654         for (i = 0; i < enerd->n_lambda; i++)
655         {
656             for (j = 0; j < efptNR; j++)
657             {
658                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
659             }
660             reset_foreign_enerdata(enerd);
661 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
662             for (th = 0; th < nbl_lists->nnbl; th++)
663             {
664                 try
665                 {
666                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
667                                               x, f, fr, mdatoms, &kernel_data, nrnb);
668                 }
669                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
670             }
671
672             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
673             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
674         }
675     }
676
677     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
678 }
679
680 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
681 {
682     return nbv != nullptr && nbv->bUseGPU;
683 }
684
685 static inline void clear_rvecs_omp(int n, rvec v[])
686 {
687     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
688
689     /* Note that we would like to avoid this conditional by putting it
690      * into the omp pragma instead, but then we still take the full
691      * omp parallel for overhead (at least with gcc5).
692      */
693     if (nth == 1)
694     {
695         for (int i = 0; i < n; i++)
696         {
697             clear_rvec(v[i]);
698         }
699     }
700     else
701     {
702 #pragma omp parallel for num_threads(nth) schedule(static)
703         for (int i = 0; i < n; i++)
704         {
705             clear_rvec(v[i]);
706         }
707     }
708 }
709
710 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
711  *
712  * \param groupOptions  Group options, containing T-coupling options
713  */
714 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
715 {
716     real nrdfCoupled   = 0;
717     real nrdfUncoupled = 0;
718     real kineticEnergy = 0;
719     for (int g = 0; g < groupOptions.ngtc; g++)
720     {
721         if (groupOptions.tau_t[g] >= 0)
722         {
723             nrdfCoupled   += groupOptions.nrdf[g];
724             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
725         }
726         else
727         {
728             nrdfUncoupled += groupOptions.nrdf[g];
729         }
730     }
731
732     /* This conditional with > also catches nrdf=0 */
733     if (nrdfCoupled > nrdfUncoupled)
734     {
735         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
736     }
737     else
738     {
739         return 0;
740     }
741 }
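/* For reference, the estimate above corresponds to (a sketch in plain notation):
 *     <Ekin> ~= (1/2) * kB * sum_g ndf_g * Tref_g   over T-coupled groups g,
 * scaled by (ndf_coupled + ndf_uncoupled)/ndf_coupled so that uncoupled degrees
 * of freedom are counted at the same average energy per degree of freedom.
 */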
742
743 /*! \brief This routine checks that the potential energy is finite.
744  *
745  * Always checks that the potential energy is finite. If step equals
746  * inputrec.init_step also checks that the magnitude of the potential energy
747  * is reasonable. Terminates with a fatal error when a check fails.
748  * Note that passing this check does not guarantee finite forces,
749  * since those use slightly different arithmetic. But in most cases
750  * there is just a narrow coordinate range where forces are not finite
751  * and energies are finite.
752  *
753  * \param[in] step      The step number, used for checking and printing
754  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
755  * \param[in] inputrec  The input record
756  */
757 static void checkPotentialEnergyValidity(gmx_int64_t           step,
758                                          const gmx_enerdata_t &enerd,
759                                          const t_inputrec     &inputrec)
760 {
761     /* Threshold valid for comparing absolute potential energy against
762      * the kinetic energy. Normally one should not consider absolute
763      * potential energy values, but with a factor of one million
764      * we should never get false positives.
765      */
766     constexpr real c_thresholdFactor = 1e6;
767
768     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
769     real           averageKineticEnergy = 0;
770     /* We only check for large potential energy at the initial step,
771      * because that is by far the most likely step for this to occur
772      * and because computing the average kinetic energy is not free.
773      * Note: nstcalcenergy >> 1 often does not allow catching large energies
774      * before they become NaN.
775      */
776     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
777     {
778         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
779     }
780
781     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
782                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
783     {
784         gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
785                   step,
786                   enerd.term[F_EPOT],
787                   energyIsNotFinite ? "not finite" : "extremely high",
788                   enerd.term[F_LJ],
789                   enerd.term[F_COUL_SR],
790                   energyIsNotFinite ? "non-finite" : "very high",
791                   energyIsNotFinite ? " or NaN" : "");
792     }
793 }
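/* Worked example (illustrative numbers only): for 1e5 T-coupled degrees of
 * freedom at a reference temperature of 300 K, the kinetic-energy estimate is
 * roughly 0.5 * 0.0083145 kJ/(mol K) * 300 K * 1e5 ~= 1.2e5 kJ/mol, so at the
 * initial step a potential energy above ~1.2e11 kJ/mol triggers the fatal error.
 */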
794
795 /*! \brief Compute forces and/or energies for special algorithms
796  *
797  * The intention is to collect all calls to algorithms that compute
798  * forces on local atoms only and that do not contribute to the local
799  * virial sum (but add their virial contribution separately).
800  * Eventually these should likely all become ForceProviders.
801  * Within this function the intention is to have algorithms that do
802  * global communication at the end, so global barriers within the MD loop
803  * are as close together as possible.
804  *
805  * \param[in]     fplog            The log file
806  * \param[in]     cr               The communication record
807  * \param[in]     inputrec         The input record
808  * \param[in]     awh              The Awh module (nullptr if none in use).
809  * \param[in]     step             The current MD step
810  * \param[in]     t                The current time
811  * \param[in,out] wcycle           Wallcycle accounting struct
812  * \param[in,out] forceProviders   Pointer to a list of force providers
813  * \param[in]     box              The unit cell
814  * \param[in]     x                The coordinates
815  * \param[in]     mdatoms          Per atom properties
816  * \param[in]     lambda           Array of free-energy lambda values
817  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
818  * \param[in,out] forceWithVirial  Force and virial buffers
819  * \param[in,out] enerd            Energy buffer
820  * \param[in,out] ed               Essential dynamics pointer
821  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
822  *
823  * \todo Remove bNS, which is used incorrectly.
824  * \todo Convert all other algorithms called here to ForceProviders.
825  */
826 static void
827 computeSpecialForces(FILE                *fplog,
828                      const t_commrec     *cr,
829                      const t_inputrec    *inputrec,
830                      gmx::Awh            *awh,
831                      gmx_int64_t          step,
832                      double               t,
833                      gmx_wallcycle_t      wcycle,
834                      ForceProviders      *forceProviders,
835                      matrix               box,
836                      ArrayRef<const RVec> x,
837                      const t_mdatoms     *mdatoms,
838                      real                *lambda,
839                      int                  forceFlags,
840                      ForceWithVirial     *forceWithVirial,
841                      gmx_enerdata_t      *enerd,
842                      const gmx_edsam     *ed,
843                      gmx_bool             bNS)
844 {
845     const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
846
847     /* NOTE: Currently all ForceProviders only provide forces.
848      *       When they also provide energies, remove this conditional.
849      */
850     if (computeForces)
851     {
852         ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
853         ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
854
855         /* Collect forces from modules */
856         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
857     }
858
859     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
860     {
861         pull_potential_wrapper(cr, inputrec, box, x,
862                                forceWithVirial,
863                                mdatoms, enerd, lambda, t,
864                                wcycle);
865
866         if (awh)
867         {
868             enerd->term[F_COM_PULL] +=
869                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
870                                                   forceWithVirial,
871                                                   t, step, wcycle, fplog);
872         }
873     }
874
875     rvec *f = as_rvec_array(forceWithVirial->force_.data());
876
877     /* Add the forces from enforced rotation potentials (if any) */
878     if (inputrec->bRot)
879     {
880         wallcycle_start(wcycle, ewcROTadd);
881         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
882         wallcycle_stop(wcycle, ewcROTadd);
883     }
884
885     if (ed)
886     {
887         /* Note that since init_edsam() is called after the initialization
888          * of forcerec, edsam doesn't request the noVirSum force buffer.
889          * Thus if no other algorithm (e.g. PME) requires it, the forces
890          * here will contribute to the virial.
891          */
892         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
893     }
894
895     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
896     if (inputrec->bIMD && computeForces)
897     {
898         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
899     }
900 }
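/* A minimal sketch of a force-providing module (an assumption about the
 * gmx::IForceProvider interface from iforceprovider.h, not code from this
 * file); such a module would be registered in the ForceProviders list that
 * computeSpecialForces() queries above:
 *
 *   class MyRestraint final : public gmx::IForceProvider
 *   {
 *       public:
 *           void calculateForces(const gmx::ForceProviderInput  &forceProviderInput,
 *                                gmx::ForceProviderOutput       *forceProviderOutput) override
 *           {
 *               // Read positions and atom data from forceProviderInput and add
 *               // this module's contribution to the force/virial and energy
 *               // buffers held by forceProviderOutput.
 *           }
 *   };
 */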
901
902 /*! \brief Launch the prepare_step and spread stages of PME GPU.
903  *
904  * \param[in]  pmedata       The PME structure
905  * \param[in]  box           The box matrix
906  * \param[in]  x             Coordinate array
907  * \param[in]  flags         Force flags
908  * \param[in]  wcycle        The wallcycle structure
909  */
910 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
911                                       matrix          box,
912                                       rvec            x[],
913                                       int             flags,
914                                       gmx_wallcycle_t wcycle)
915 {
916     int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
917     pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
918     pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
919
920     pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
921     pme_gpu_launch_spread(pmedata, x, wcycle);
922 }
923
924 /*! \brief Launch the FFT and gather stages of PME GPU
925  *
926  * This function only implements setting the output forces (no accumulation).
927  *
928  * \param[in]  pmedata        The PME structure
929  * \param[in]  wcycle         The wallcycle structure
930  */
931 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
932                                      gmx_wallcycle_t   wcycle)
933 {
934     pme_gpu_launch_complex_transforms(pmedata, wcycle);
935     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
936 }
937
938 /*! \brief
939  *  Polling wait for either of the PME or nonbonded GPU tasks.
940  *
941  * Instead of a static order in waiting for GPU tasks, this function
942  * polls to check which of the two tasks completes first, and does the
943  * associated force buffer reduction overlapped with the other task.
944  * Unlike a static scheduling order, this always overlaps one of the
945  * reductions, regardless of the GPU task completion order.
946  *
947  * \param[in]     nbv              Nonbonded verlet structure
948  * \param[in]     pmedata          PME module data
949  * \param[in,out] force            Force array to reduce task outputs into.
950  * \param[in,out] forceWithVirial  Force and virial buffers
951  * \param[in,out] fshift           Shift force output vector results are reduced into
952  * \param[in,out] enerd            Energy data structure results are reduced into
953  * \param[in]     flags            Force flags
954  * \param[in]     wcycle           The wallcycle structure
955  */
956 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
957                                         const gmx_pme_t                *pmedata,
958                                         gmx::PaddedArrayRef<gmx::RVec> *force,
959                                         ForceWithVirial                *forceWithVirial,
960                                         rvec                            fshift[],
961                                         gmx_enerdata_t                 *enerd,
962                                         int                             flags,
963                                         gmx_wallcycle_t                 wcycle)
964 {
965     bool isPmeGpuDone = false;
966     bool isNbGpuDone  = false;
967
968
969     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
970
971     while (!isPmeGpuDone || !isNbGpuDone)
972     {
973         if (!isPmeGpuDone)
974         {
975             matrix            vir_Q;
976             real              Vlr_q;
977
978             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
979             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
980                                                    vir_Q, &Vlr_q, completionType);
981
982             if (isPmeGpuDone)
983             {
984                 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
985                                        enerd, vir_Q, Vlr_q);
986             }
987         }
988
989         if (!isNbGpuDone)
990         {
991             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
992             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
993             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
994                                                     flags, eatLocal,
995                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
996                                                     fshift, completionType);
997             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
998             // To get the call count right, when the task finished we
999             // issue a start/stop.
1000             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1001             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1002             if (isNbGpuDone)
1003             {
1004                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1005                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1006
1007                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1008                                                nbv->nbat, as_rvec_array(force->data()), wcycle);
1009             }
1010         }
1011     }
1012 }
1013
1014 /*! \brief
1015  *  Launch the dynamic rolling pruning GPU task.
1016  *
1017  *  We currently alternate local/non-local list pruning in odd-even steps
1018  *  (only pruning every second step without DD).
1019  *
1020  * \param[in]     cr               The communication record
1021  * \param[in]     nbv              Nonbonded verlet structure
1022  * \param[in]     inputrec         The input record
1023  * \param[in]     step             The current MD step
1024  */
1025 static inline void launchGpuRollingPruning(const t_commrec          *cr,
1026                                            const nonbonded_verlet_t *nbv,
1027                                            const t_inputrec         *inputrec,
1028                                            const gmx_int64_t         step)
1029 {
1030     /* We should not launch the rolling pruning kernel at a search
1031      * step or just before search steps, since that's useless.
1032      * Without domain decomposition we prune at even steps.
1033      * With domain decomposition we alternate local and non-local
1034      * pruning at even and odd steps.
1035      */
1036     int  numRollingParts     = nbv->listParams->numRollingParts;
1037     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1038     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1039     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1040     if (stepWithCurrentList > 0 &&
1041         stepWithCurrentList < inputrec->nstlist - 1 &&
1042         (stepIsEven || DOMAINDECOMP(cr)))
1043     {
1044         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1045                                           stepIsEven ? eintLocal : eintNonlocal,
1046                                           numRollingParts);
1047     }
1048 }
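/* Scheduling sketch derived from the conditions above (illustrative numbers):
 * without domain decomposition, with nstlist = 20 and nstlistPrune = 10 (so
 * numRollingParts = 5), pruning kernels are launched at the even steps
 * 2, 4, ..., 18 after pair-list creation, each launch pruning a portion of the
 * list so that the whole list is refreshed incrementally.
 */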
1049
1050 static void do_force_cutsVERLET(FILE *fplog,
1051                                 const t_commrec *cr,
1052                                 const gmx_multisim_t *ms,
1053                                 const t_inputrec *inputrec,
1054                                 gmx::Awh *awh,
1055                                 gmx_int64_t step,
1056                                 t_nrnb *nrnb,
1057                                 gmx_wallcycle_t wcycle,
1058                                 const gmx_localtop_t *top,
1059                                 const gmx_groups_t * /* groups */,
1060                                 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1061                                 history_t *hist,
1062                                 gmx::PaddedArrayRef<gmx::RVec> force,
1063                                 tensor vir_force,
1064                                 const t_mdatoms *mdatoms,
1065                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1066                                 real *lambda,
1067                                 t_graph *graph,
1068                                 t_forcerec *fr,
1069                                 interaction_const_t *ic,
1070                                 const gmx_vsite_t *vsite,
1071                                 rvec mu_tot,
1072                                 double t,
1073                                 const gmx_edsam *ed,
1074                                 int flags,
1075                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1076                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1077 {
1078     int                 cg1, i, j;
1079     double              mu[2*DIM];
1080     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1081     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1082     rvec                vzero, box_diag;
1083     float               cycles_pme, cycles_wait_gpu;
1084     nonbonded_verlet_t *nbv = fr->nbv;
1085
1086     bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1087     bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1088     bFillGrid     = (bNS && bStateChanged);
1089     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1090     bDoForces     = (flags & GMX_FORCE_FORCES);
1091     bUseGPU       = fr->nbv->bUseGPU;
1092     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1093
1094     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1095     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1096     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1097         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1098
1099     /* At a search step we need to start the first balancing region
1100      * somewhere early inside the step after communication during domain
1101      * decomposition (and not during the previous step as usual).
1102      */
1103     if (bNS &&
1104         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1105     {
1106         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1107     }
1108
1109     cycles_wait_gpu = 0;
1110
1111     const int start  = 0;
1112     const int homenr = mdatoms->homenr;
1113
1114     clear_mat(vir_force);
1115
1116     if (DOMAINDECOMP(cr))
1117     {
1118         cg1 = cr->dd->globalAtomGroupIndices.size();
1119     }
1120     else
1121     {
1122         cg1 = top->cgs.nr;
1123     }
1124     if (fr->n_tpi > 0)
1125     {
1126         cg1--;
1127     }
1128
1129     if (bStateChanged)
1130     {
1131         update_forcerec(fr, box);
1132
1133         if (inputrecNeedMutot(inputrec))
1134         {
1135             /* Calculate total (local) dipole moment in a temporary common array.
1136              * This makes it possible to sum them over nodes faster.
1137              */
1138             calc_mu(start, homenr,
1139                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1140                     mu, mu+DIM);
1141         }
1142     }
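    /* For reference (standard definition, not text from this file): calc_mu()
     * accumulates the local dipole moments mu_A = sum_i q_i^A x_i and
     * mu_B = sum_i q_i^B x_i for the two charge states; they are summed over
     * ranks and combined with lambda further below.
     */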
1143
1144     if (fr->ePBC != epbcNONE)
1145     {
1146         /* Compute shift vectors every step,
1147          * because of pressure coupling or box deformation!
1148          */
1149         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1150         {
1151             calc_shifts(box, fr->shift_vec);
1152         }
1153
1154         if (bCalcCGCM)
1155         {
1156             put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1157             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1158         }
1159         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1160         {
1161             unshift_self(graph, box, as_rvec_array(x.data()));
1162         }
1163     }
1164
1165     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1166                                  fr->shift_vec, nbv->nbat);
1167
1168 #if GMX_MPI
1169     if (!thisRankHasDuty(cr, DUTY_PME))
1170     {
1171         /* Send particle coordinates to the pme nodes.
1172          * Since this is only implemented for domain decomposition
1173          * and domain decomposition does not use the graph,
1174          * we do not need to worry about shifting.
1175          */
1176         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1177                                  lambda[efptCOUL], lambda[efptVDW],
1178                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1179                                  step, wcycle);
1180     }
1181 #endif /* GMX_MPI */
1182
1183     if (useGpuPme)
1184     {
1185         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1186     }
1187
1188     /* do gridding for pair search */
1189     if (bNS)
1190     {
1191         if (graph && bStateChanged)
1192         {
1193             /* Calculate intramolecular shift vectors to make molecules whole */
1194             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1195         }
1196
1197         clear_rvec(vzero);
1198         box_diag[XX] = box[XX][XX];
1199         box_diag[YY] = box[YY][YY];
1200         box_diag[ZZ] = box[ZZ][ZZ];
1201
1202         wallcycle_start(wcycle, ewcNS);
1203         if (!DOMAINDECOMP(cr))
1204         {
1205             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1206             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1207                               0, vzero, box_diag,
1208                               0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1209                               0, nullptr,
1210                               nbv->grp[eintLocal].kernel_type,
1211                               nbv->nbat);
1212             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1213         }
1214         else
1215         {
1216             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1217             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1218                                        fr->cginfo, as_rvec_array(x.data()),
1219                                        nbv->grp[eintNonlocal].kernel_type,
1220                                        nbv->nbat);
1221             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1222         }
1223
1224         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1225         wallcycle_stop(wcycle, ewcNS);
1226     }
1227
1228     /* initialize the GPU atom data and copy shift vector */
1229     if (bUseGPU)
1230     {
1231         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1232         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1233
1234         if (bNS)
1235         {
1236             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1237         }
1238
1239         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1240
1241         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1242         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1243     }
1244
1245     /* do local pair search */
1246     if (bNS)
1247     {
1248         wallcycle_start_nocount(wcycle, ewcNS);
1249         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1250         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1251                             &top->excls,
1252                             nbv->listParams->rlistOuter,
1253                             nbv->min_ci_balanced,
1254                             &nbv->grp[eintLocal].nbl_lists,
1255                             eintLocal,
1256                             nbv->grp[eintLocal].kernel_type,
1257                             nrnb);
1258         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1259         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1260         {
1261             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1262         }
1263         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1264
1265         if (bUseGPU)
1266         {
1267             /* initialize local pair-list on the GPU */
1268             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1269                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
1270                                     eintLocal);
1271         }
1272         wallcycle_stop(wcycle, ewcNS);
1273     }
1274     else
1275     {
1276         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1277                                         nbv->nbat, wcycle);
1278     }
1279
1280     if (bUseGPU)
1281     {
1282         if (DOMAINDECOMP(cr))
1283         {
1284             ddOpenBalanceRegionGpu(cr->dd);
1285         }
1286
1287         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1288         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1289         /* launch local nonbonded work on GPU */
1290         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1291                      step, nrnb, wcycle);
1292         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1294     }
1295
1296     if (useGpuPme)
1297     {
1298         // In PME GPU and mixed mode we launch FFT / gather after the
1299         // X copy/transform to allow overlap as well as after the GPU NB
1300         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1301         // the nonbonded kernel.
1302         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1303     }
1304
1305     /* Communicate coordinates and sum dipole if necessary +
1306        do non-local pair search */
1307     if (DOMAINDECOMP(cr))
1308     {
1309         if (bNS)
1310         {
1311             wallcycle_start_nocount(wcycle, ewcNS);
1312             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1313
1314             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1315                                 &top->excls,
1316                                 nbv->listParams->rlistOuter,
1317                                 nbv->min_ci_balanced,
1318                                 &nbv->grp[eintNonlocal].nbl_lists,
1319                                 eintNonlocal,
1320                                 nbv->grp[eintNonlocal].kernel_type,
1321                                 nrnb);
1322             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1323             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1324             {
1325                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1326             }
1327             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1328
1329             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1330             {
1331                 /* initialize non-local pair-list on the GPU */
1332                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1333                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1334                                         eintNonlocal);
1335             }
1336             wallcycle_stop(wcycle, ewcNS);
1337         }
1338         else
1339         {
1340             dd_move_x(cr->dd, box, x, wcycle);
1341
1342             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1343                                             nbv->nbat, wcycle);
1344         }
1345
1346         if (bUseGPU)
1347         {
1348             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1349             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350             /* launch non-local nonbonded tasks on GPU */
1351             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1352                          step, nrnb, wcycle);
1353             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1354             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1355         }
1356     }
1357
1358     if (bUseGPU)
1359     {
1360         /* launch D2H copy-back F */
1361         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1362         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1363         if (DOMAINDECOMP(cr))
1364         {
1365             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1366                                      flags, eatNonlocal);
1367         }
1368         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1369                                  flags, eatLocal);
1370         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1371         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1372     }
1373
1374     if (bStateChanged && inputrecNeedMutot(inputrec))
1375     {
1376         if (PAR(cr))
1377         {
1378             gmx_sumd(2*DIM, mu, cr);
1379             ddReopenBalanceRegionCpu(cr->dd);
1380         }
1381
1382         for (i = 0; i < 2; i++)
1383         {
1384             for (j = 0; j < DIM; j++)
1385             {
1386                 fr->mu_tot[i][j] = mu[i*DIM + j];
1387             }
1388         }
1389     }
1390     if (fr->efep == efepNO)
1391     {
1392         copy_rvec(fr->mu_tot[0], mu_tot);
1393     }
1394     else
1395     {
1396         for (j = 0; j < DIM; j++)
1397         {
1398             mu_tot[j] =
1399                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1400                 lambda[efptCOUL]*fr->mu_tot[1][j];
1401         }
1402     }
1403
1404     /* Reset energies */
1405     reset_enerdata(enerd);
1406     clear_rvecs(SHIFTS, fr->fshift);
1407
1408     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1409     {
1410         wallcycle_start(wcycle, ewcPPDURINGPME);
1411         dd_force_flop_start(cr->dd, nrnb);
1412     }
1413
1414     if (inputrec->bRot)
1415     {
1416         wallcycle_start(wcycle, ewcROT);
1417         do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1418         wallcycle_stop(wcycle, ewcROT);
1419     }
1420
1421     /* Temporary solution until all routines take PaddedRVecVector */
1422     rvec *f = as_rvec_array(force.data());
1423
1424     /* Start the force cycle counter.
1425      * Note that a different counter is used for dynamic load balancing.
1426      */
1427     wallcycle_start(wcycle, ewcFORCE);
1428
1429     gmx::ArrayRef<gmx::RVec> forceRef = force;
1430     if (bDoForces)
1431     {
1432         /* If we need to compute the virial, we might need a separate
1433          * force buffer for algorithms for which the virial is calculated
1434          * directly, such as PME.
1435          */
1436         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1437         {
1438             forceRef = *fr->forceBufferForDirectVirialContributions;
1439
1440             /* We only compute forces on local atoms. Note that vsites can
1441              * spread to non-local atoms, but that part of the buffer is
1442              * cleared separately in the vsite spreading code.
1443              */
1444             clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1445         }
1446         /* Clear the short- and long-range forces */
1447         clear_rvecs_omp(fr->natoms_force_constr, f);
1448     }
1449
1450     /* forceWithVirial uses the local atom range only */
1451     ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1452
1453     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1454     {
1455         clear_pull_forces(inputrec->pull_work);
1456     }
1457
1458     /* We calculate the non-bonded forces, when done on the CPU, here.
1459      * We do this before calling do_force_lowlevel, because in that
1460      * function, the listed forces are calculated before PME, which
1461      * does communication.  With this order, non-bonded and listed
1462      * force calculation imbalance can be balanced out by the domain
1463      * decomposition load balancing.
1464      */
1465
1466     if (!bUseOrEmulGPU)
1467     {
1468         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1469                      step, nrnb, wcycle);
1470     }
1471
1472     if (fr->efep != efepNO)
1473     {
1474         /* Calculate the local and non-local free energy interactions here.
1475          * Happens here on the CPU both with and without GPU.
1476          */
1477         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1478         {
1479             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1480                              fr, as_rvec_array(x.data()), f, mdatoms,
1481                              inputrec->fepvals, lambda,
1482                              enerd, flags, nrnb, wcycle);
1483         }
1484
1485         if (DOMAINDECOMP(cr) &&
1486             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1487         {
1488             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1489                              fr, as_rvec_array(x.data()), f, mdatoms,
1490                              inputrec->fepvals, lambda,
1491                              enerd, flags, nrnb, wcycle);
1492         }
1493     }
1494
1495     if (!bUseOrEmulGPU)
1496     {
1497         int aloc;
1498
1499         if (DOMAINDECOMP(cr))
1500         {
1501             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1502                          step, nrnb, wcycle);
1503         }
1504
1505         if (!bUseOrEmulGPU)
1506         {
1507             aloc = eintLocal;
1508         }
1509         else
1510         {
1511             aloc = eintNonlocal;
1512         }
1513
1514         /* Add all the non-bonded forces to the normal force array.
1515          * This can be split into a local and a non-local part when overlapping
1516          * communication with calculation with domain decomposition.
1517          */
1518         wallcycle_stop(wcycle, ewcFORCE);
1519
1520         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1521
1522         wallcycle_start_nocount(wcycle, ewcFORCE);
1523
1524         /* if there are multiple fshift output buffers reduce them */
1525         if ((flags & GMX_FORCE_VIRIAL) &&
1526             nbv->grp[aloc].nbl_lists.nnbl > 1)
1527         {
1528             /* This is not in a subcounter because it takes a
1529                negligible and constant-sized amount of time */
1530             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1531                                                      fr->fshift);
1532         }
1533     }
1534
1535     /* update QMMMrec, if necessary */
1536     if (fr->bQMMM)
1537     {
1538         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1539     }
1540
1541     /* Compute the bonded and non-bonded energies and optionally forces */
1542     do_force_lowlevel(fr, inputrec, &(top->idef),
1543                       cr, ms, nrnb, wcycle, mdatoms,
1544                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1545                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1546                       flags, &cycles_pme);
1547
1548     wallcycle_stop(wcycle, ewcFORCE);
1549
1550     computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
1551                          fr->forceProviders, box, x, mdatoms, lambda,
1552                          flags, &forceWithVirial, enerd,
1553                          ed, bNS);
1554
1555     if (bUseOrEmulGPU)
1556     {
1557         /* wait for non-local forces (or calculate in emulation mode) */
1558         if (DOMAINDECOMP(cr))
1559         {
1560             if (bUseGPU)
1561             {
1562                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1563                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1564                                            flags, eatNonlocal,
1565                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1566                                            fr->fshift);
1567                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1568             }
1569             else
1570             {
1571                 wallcycle_start_nocount(wcycle, ewcFORCE);
1572                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1573                              step, nrnb, wcycle);
1574                 wallcycle_stop(wcycle, ewcFORCE);
1575             }
1576
1577             /* skip the reduction if there was no non-local work to do */
1578             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1579             {
1580                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1581                                                nbv->nbat, f, wcycle);
1582             }
1583         }
1584     }
1585
1586     if (DOMAINDECOMP(cr))
1587     {
1588         /* We are done with the CPU compute.
1589          * We will now communicate the non-local forces.
1590          * If we use a GPU this will overlap with GPU work, so in that case
1591          * we do not close the DD force balancing region here.
1592          */
1593         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1594         {
1595             ddCloseBalanceRegionCpu(cr->dd);
1596         }
1597         if (bDoForces)
1598         {
1599             dd_move_f(cr->dd, force, fr->fshift, wcycle);
1600         }
1601     }
1602
1603     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1604     // an alternating wait/reduction scheme.
1605     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1606     if (alternateGpuWait)
1607     {
1608         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1609     }
1610
1611     if (!alternateGpuWait && useGpuPme)
1612     {
1613         gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1614         matrix vir_Q;
1615         real   Vlr_q = 0.0;
1616         pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1617         pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1618     }
1619
1620     /* Wait for local GPU NB outputs on the non-alternating wait path */
1621     if (!alternateGpuWait && bUseGPU)
1622     {
1623         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1624          * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1625          * but even with a step of 0.1 ms the difference is less than 1%
1626          * of the step time.
1627          */
1628         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1629
1630         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1631         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1632                                    flags, eatLocal,
1633                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1634                                    fr->fshift);
1635         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1636
1637         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1638         {
1639             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1640             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1641             {
1642                 /* We measured few cycles, it could be that the kernel
1643                  * and transfer finished earlier and there was no actual
1644                  * wait time, only API call overhead.
1645                  * Then the actual time could be anywhere between 0 and
1646                  * cycles_wait_est. We will use half of cycles_wait_est.
1647                  */
1648                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1649             }
1650             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1651         }
1652     }
1653
1654     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1655     {
1656         // NOTE: emulation kernel is not included in the balancing region,
1657         // but emulation mode does not target performance anyway
1658         wallcycle_start_nocount(wcycle, ewcFORCE);
1659         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1660                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1661                      step, nrnb, wcycle);
1662         wallcycle_stop(wcycle, ewcFORCE);
1663     }
1664
1665     if (useGpuPme)
1666     {
1667         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1668     }
1669
1670     if (bUseGPU)
1671     {
1672         /* now clear the GPU outputs while we finish the step on the CPU */
1673         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1674         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1675         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1676
1677         /* Is dynamic pair-list pruning activated? */
1678         if (nbv->listParams->useDynamicPruning)
1679         {
1680             launchGpuRollingPruning(cr, nbv, inputrec, step);
1681         }
1682         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1683         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1684     }
1685
1686     /* Do the nonbonded GPU (or emulation) force buffer reduction
1687      * on the non-alternating path. */
1688     if (bUseOrEmulGPU && !alternateGpuWait)
1689     {
1690         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1691                                        nbv->nbat, f, wcycle);
1692     }
1693
1694     if (DOMAINDECOMP(cr))
1695     {
1696         dd_force_flop_stop(cr->dd, nrnb);
1697     }
1698
1699     if (bDoForces)
1700     {
1701         /* If we have NoVirSum forces, but we do not calculate the virial,
1702          * we sum fr->f_novirsum=f later.
1703          */
1704         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1705         {
1706             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1707                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1708         }
1709
1710         if (flags & GMX_FORCE_VIRIAL)
1711         {
1712             /* Calculation of the virial must be done after vsites! */
1713             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1714                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1715         }
1716     }
1717
1718     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1719     {
1720         /* In case of node-splitting, the PP nodes receive the long-range
1721          * forces, virial and energy from the PME nodes here.
1722          */
1723         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1724     }
1725
1726     if (bDoForces)
1727     {
1728         post_process_forces(cr, step, nrnb, wcycle,
1729                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1730                             vir_force, mdatoms, graph, fr, vsite,
1731                             flags);
1732     }
1733
1734     if (flags & GMX_FORCE_ENERGY)
1735     {
1736         /* Sum the potential energy terms from group contributions */
1737         sum_epot(&(enerd->grpp), enerd->term);
1738
1739         if (!EI_TPI(inputrec->eI))
1740         {
1741             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1742         }
1743     }
1744 }
1745
1746 static void do_force_cutsGROUP(FILE *fplog,
1747                                const t_commrec *cr,
1748                                const gmx_multisim_t *ms,
1749                                const t_inputrec *inputrec,
1750                                gmx::Awh *awh,
1751                                gmx_int64_t step,
1752                                t_nrnb *nrnb,
1753                                gmx_wallcycle_t wcycle,
1754                                gmx_localtop_t *top,
1755                                const gmx_groups_t *groups,
1756                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1757                                history_t *hist,
1758                                gmx::PaddedArrayRef<gmx::RVec> force,
1759                                tensor vir_force,
1760                                const t_mdatoms *mdatoms,
1761                                gmx_enerdata_t *enerd,
1762                                t_fcdata *fcd,
1763                                real *lambda,
1764                                t_graph *graph,
1765                                t_forcerec *fr,
1766                                const gmx_vsite_t *vsite,
1767                                rvec mu_tot,
1768                                double t,
1769                                const gmx_edsam *ed,
1770                                int flags,
1771                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1772                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1773 {
1774     int        cg0, cg1, i, j;
1775     double     mu[2*DIM];
1776     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1777     gmx_bool   bDoForces;
1778     float      cycles_pme;
1779
1780     const int  start  = 0;
1781     const int  homenr = mdatoms->homenr;
1782
1783     clear_mat(vir_force);
1784
1785     cg0 = 0;
1786     if (DOMAINDECOMP(cr))
1787     {
1788         cg1 = cr->dd->globalAtomGroupIndices.size();
1789     }
1790     else
1791     {
1792         cg1 = top->cgs.nr;
1793     }
1794     if (fr->n_tpi > 0)
1795     {
1796         cg1--;
1797     }
1798
1799     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1800     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1801     /* Should we fill the neighbor-search grid with charge groups this step? */
1802     bFillGrid      = (bNS && bStateChanged);
1803     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1804     bDoForces      = (flags & GMX_FORCE_FORCES);
1805
1806     if (bStateChanged)
1807     {
1808         update_forcerec(fr, box);
1809
1810         if (inputrecNeedMutot(inputrec))
1811         {
1812             /* Calculate total (local) dipole moment in a temporary common array.
1813              * This makes it possible to sum them over nodes faster.
1814              */
1815             calc_mu(start, homenr,
1816                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1817                     mu, mu+DIM);
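            /* mu holds two dvecs back to back: mu[0..DIM-1] is the state A
             * dipole and mu[DIM..2*DIM-1] the state B dipole; they are summed
             * over ranks and unpacked into fr->mu_tot further down.
             */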
1818         }
1819     }
1820
1821     if (fr->ePBC != epbcNONE)
1822     {
1823         /* Compute shift vectors every step,
1824          * because of pressure coupling or box deformation!
1825          */
1826         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1827         {
1828             calc_shifts(box, fr->shift_vec);
1829         }
1830
1831         if (bCalcCGCM)
1832         {
1833             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1834                                      &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1835             inc_nrnb(nrnb, eNR_CGCM, homenr);
1836             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1837         }
1838         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1839         {
1840             unshift_self(graph, box, as_rvec_array(x.data()));
1841         }
1842     }
1843     else if (bCalcCGCM)
1844     {
1845         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1846         inc_nrnb(nrnb, eNR_CGCM, homenr);
1847     }
1848
1849     if (bCalcCGCM && gmx_debug_at)
1850     {
1851         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1852     }
1853
1854 #if GMX_MPI
1855     if (!thisRankHasDuty(cr, DUTY_PME))
1856     {
1857         /* Send particle coordinates to the pme nodes.
1858          * Since this is only implemented for domain decomposition
1859          * and domain decomposition does not use the graph,
1860          * we do not need to worry about shifting.
1861          */
1862         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1863                                  lambda[efptCOUL], lambda[efptVDW],
1864                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1865                                  step, wcycle);
1866     }
1867 #endif /* GMX_MPI */
1868
1869     /* Communicate coordinates and sum dipole if necessary */
1870     if (DOMAINDECOMP(cr))
1871     {
1872         dd_move_x(cr->dd, box, x, wcycle);
1873
1874         /* No GPU support, no move_x overlap, so reopen the balance region here */
1875         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1876         {
1877             ddReopenBalanceRegionCpu(cr->dd);
1878         }
1879     }
1880
1881     if (inputrecNeedMutot(inputrec))
1882     {
1883         if (bStateChanged)
1884         {
1885             if (PAR(cr))
1886             {
1887                 gmx_sumd(2*DIM, mu, cr);
1888                 ddReopenBalanceRegionCpu(cr->dd);
1889             }
1890             for (i = 0; i < 2; i++)
1891             {
1892                 for (j = 0; j < DIM; j++)
1893                 {
1894                     fr->mu_tot[i][j] = mu[i*DIM + j];
1895                 }
1896             }
1897         }
1898         if (fr->efep == efepNO)
1899         {
1900             copy_rvec(fr->mu_tot[0], mu_tot);
1901         }
1902         else
1903         {
1904             for (j = 0; j < DIM; j++)
1905             {
1906                 mu_tot[j] =
1907                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1908             }
1909         }
1910     }
1911
1912     /* Reset energies */
1913     reset_enerdata(enerd);
1914     clear_rvecs(SHIFTS, fr->fshift);
1915
1916     if (bNS)
1917     {
1918         wallcycle_start(wcycle, ewcNS);
1919
1920         if (graph && bStateChanged)
1921         {
1922             /* Calculate intramolecular shift vectors to make molecules whole */
1923             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1924         }
1925
1926         /* Do the actual neighbour searching */
1927         ns(fplog, fr, box,
1928            groups, top, mdatoms,
1929            cr, nrnb, bFillGrid);
1930
1931         wallcycle_stop(wcycle, ewcNS);
1932     }
1933
1934     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1935     {
1936         wallcycle_start(wcycle, ewcPPDURINGPME);
1937         dd_force_flop_start(cr->dd, nrnb);
1938     }
1939
1940     if (inputrec->bRot)
1941     {
1942         wallcycle_start(wcycle, ewcROT);
1943         do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1944         wallcycle_stop(wcycle, ewcROT);
1945     }
1946
1947     /* Temporary solution until all routines take PaddedRVecVector */
1948     rvec *f = as_rvec_array(force.data());
1949
1950     /* Start the force cycle counter.
1951      * Note that a different counter is used for dynamic load balancing.
1952      */
1953     wallcycle_start(wcycle, ewcFORCE);
1954
1955     gmx::ArrayRef<gmx::RVec> forceRef = force;
1956     if (bDoForces)
1957     {
1958         /* If we need to compute the virial, we might need a separate
1959          * force buffer for algorithms for which the virial is calculated
1960          * separately, such as PME.
1961          */
1962         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1963         {
1964             forceRef = *fr->forceBufferForDirectVirialContributions;
1965
1966             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1967         }
1968
1969         /* Clear the short- and long-range forces */
1970         clear_rvecs(fr->natoms_force_constr, f);
1971     }
1972
1973     /* forceWithVirial might need the full force atom range */
1974     ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1975
1976     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1977     {
1978         clear_pull_forces(inputrec->pull_work);
1979     }
1980
1981     /* update QMMMrec, if necessary */
1982     if (fr->bQMMM)
1983     {
1984         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1985     }
1986
1987     /* Compute the bonded and non-bonded energies and optionally forces */
1988     do_force_lowlevel(fr, inputrec, &(top->idef),
1989                       cr, ms, nrnb, wcycle, mdatoms,
1990                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1991                       box, inputrec->fepvals, lambda,
1992                       graph, &(top->excls), fr->mu_tot,
1993                       flags,
1994                       &cycles_pme);
1995
1996     wallcycle_stop(wcycle, ewcFORCE);
1997
1998     if (DOMAINDECOMP(cr))
1999     {
2000         dd_force_flop_stop(cr->dd, nrnb);
2001
2002         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2003         {
2004             ddCloseBalanceRegionCpu(cr->dd);
2005         }
2006     }
2007
2008     computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
2009                          fr->forceProviders, box, x, mdatoms, lambda,
2010                          flags, &forceWithVirial, enerd,
2011                          ed, bNS);
2012
2013     if (bDoForces)
2014     {
2015         /* Communicate the forces */
2016         if (DOMAINDECOMP(cr))
2017         {
2018             dd_move_f(cr->dd, force, fr->fshift, wcycle);
2019             /* Do we need to communicate the separate force array
2020              * for terms that do not contribute to the single sum virial?
2021              * Position restraints and electric fields do not introduce
2022              * inter-cg forces, only full electrostatics methods do.
2023              * When we do not calculate the virial, fr->f_novirsum = f,
2024              * so we have already communicated these forces.
2025              */
2026             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2027                 (flags & GMX_FORCE_VIRIAL))
2028             {
2029                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2030             }
2031         }
2032
2033         /* If we have NoVirSum forces, but we do not calculate the virial,
2034          * we sum fr->f_novirsum=f later.
2035          */
2036         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2037         {
2038             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2039                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2040         }
2041
2042         if (flags & GMX_FORCE_VIRIAL)
2043         {
2044             /* Calculation of the virial must be done after vsites! */
2045             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2046                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2047         }
2048     }
2049
2050     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2051     {
2052         /* In case of node-splitting, the PP nodes receive the long-range
2053          * forces, virial and energy from the PME nodes here.
2054          */
2055         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2056     }
2057
2058     if (bDoForces)
2059     {
2060         post_process_forces(cr, step, nrnb, wcycle,
2061                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2062                             vir_force, mdatoms, graph, fr, vsite,
2063                             flags);
2064     }
2065
2066     if (flags & GMX_FORCE_ENERGY)
2067     {
2068         /* Sum the potential energy terms from group contributions */
2069         sum_epot(&(enerd->grpp), enerd->term);
2070
2071         if (!EI_TPI(inputrec->eI))
2072         {
2073             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2074         }
2075     }
2076
2077 }
2078
2079 void do_force(FILE                                     *fplog,
2080               const t_commrec                          *cr,
2081               const gmx_multisim_t                     *ms,
2082               const t_inputrec                         *inputrec,
2083               gmx::Awh                                 *awh,
2084               gmx_int64_t                               step,
2085               t_nrnb                                   *nrnb,
2086               gmx_wallcycle_t                           wcycle,
2087               gmx_localtop_t                           *top,
2088               const gmx_groups_t                       *groups,
2089               matrix                                    box,
2090               gmx::PaddedArrayRef<gmx::RVec>            x,
2091               history_t                                *hist,
2092               gmx::PaddedArrayRef<gmx::RVec>            force,
2093               tensor                                    vir_force,
2094               const t_mdatoms                          *mdatoms,
2095               gmx_enerdata_t                           *enerd,
2096               t_fcdata                                 *fcd,
2097               gmx::ArrayRef<real>                       lambda,
2098               t_graph                                  *graph,
2099               t_forcerec                               *fr,
2100               const gmx_vsite_t                        *vsite,
2101               rvec                                      mu_tot,
2102               double                                    t,
2103               const gmx_edsam                          *ed,
2104               int                                       flags,
2105               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2106               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2107 {
2108     /* modify force flag if not doing nonbonded */
2109     if (!fr->bNonbonded)
2110     {
2111         flags &= ~GMX_FORCE_NONBONDED;
2112     }
2113
2114     GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2115     GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2116
2117     switch (inputrec->cutoff_scheme)
2118     {
2119         case ecutsVERLET:
2120             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2121                                 awh, step, nrnb, wcycle,
2122                                 top,
2123                                 groups,
2124                                 box, x, hist,
2125                                 force, vir_force,
2126                                 mdatoms,
2127                                 enerd, fcd,
2128                                 lambda.data(), graph,
2129                                 fr, fr->ic,
2130                                 vsite, mu_tot,
2131                                 t, ed,
2132                                 flags,
2133                                 ddOpenBalanceRegion,
2134                                 ddCloseBalanceRegion);
2135             break;
2136         case ecutsGROUP:
2137             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2138                                awh, step, nrnb, wcycle,
2139                                top,
2140                                groups,
2141                                box, x, hist,
2142                                force, vir_force,
2143                                mdatoms,
2144                                enerd, fcd,
2145                                lambda.data(), graph,
2146                                fr, vsite, mu_tot,
2147                                t, ed,
2148                                flags,
2149                                ddOpenBalanceRegion,
2150                                ddCloseBalanceRegion);
2151             break;
2152         default:
2153             gmx_incons("Invalid cut-off scheme passed!");
2154     }
2155
2156     /* In case we don't have constraints and are using GPUs, the next balancing
2157      * region starts here.
2158      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2159      * virial calculation and COM pulling, is thus not included in
2160      * the balance timing, which is ok as most tasks do communication.
2161      */
2162     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2163     {
2164         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2165     }
2166 }
2167
2168
2169 void do_constrain_first(FILE *fplog, Constraints *constr,
2170                         t_inputrec *ir, t_mdatoms *md,
2171                         t_state *state)
2172 {
2173     int             i, m, start, end;
2174     gmx_int64_t     step;
2175     real            dt = ir->delta_t;
2176     real            dvdl_dum;
2177     rvec           *savex;
2178
2179     /* We need to allocate one element extra, since we might use
2180      * (unaligned) 4-wide SIMD loads to access rvec entries.
2181      */
2182     snew(savex, state->natoms + 1);
2183
2184     start = 0;
2185     end   = md->homenr;
2186
2187     if (debug)
2188     {
2189         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2190                 start, md->homenr, end);
2191     }
2192     /* Do a first constraining pass to reset the particles... */
2193     step = ir->init_step;
2194     if (fplog)
2195     {
2196         char buf[STEPSTRSIZE];
2197         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2198                 gmx_step_str(step, buf));
2199     }
2200     dvdl_dum = 0;
2201
2202     /* constrain the current position */
2203     constr->apply(TRUE, FALSE,
2204                   step, 0, 1.0,
2205                   as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2206                   state->box,
2207                   state->lambda[efptBONDED], &dvdl_dum,
2208                   nullptr, nullptr, ConstraintVariable::Positions);
2209     if (EI_VV(ir->eI))
2210     {
2211         /* constrain the initial velocity, and save it */
2212         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2213         constr->apply(TRUE, FALSE,
2214                       step, 0, 1.0,
2215                       as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2216                       state->box,
2217                       state->lambda[efptBONDED], &dvdl_dum,
2218                       nullptr, nullptr, ConstraintVariable::Velocities);
2219     }
2220     /* constrain the initial velocities at t-dt/2 */
2221     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2222     {
2223         for (i = start; (i < end); i++)
2224         {
2225             for (m = 0; (m < DIM); m++)
2226             {
2227                 /* Reverse the velocity */
2228                 state->v[i][m] = -state->v[i][m];
2229                 /* Store the position at t-dt in buf */
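                /* With v reversed, x + dt*v equals x(0) - dt*v(0), i.e. the
                 * (unconstrained) position one step back in time.
                 */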
2230                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2231             }
2232         }
2233         /* Shake the positions at t=-dt with the positions at t=0
2234          * as reference coordinates.
2235          */
2236         if (fplog)
2237         {
2238             char buf[STEPSTRSIZE];
2239             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2240                     gmx_step_str(step, buf));
2241         }
2242         dvdl_dum = 0;
2243         constr->apply(TRUE, FALSE,
2244                       step, -1, 1.0,
2245                       as_rvec_array(state->x.data()), savex, nullptr,
2246                       state->box,
2247                       state->lambda[efptBONDED], &dvdl_dum,
2248                       as_rvec_array(state->v.data()), nullptr, ConstraintVariable::Positions);
2249
2250         for (i = start; i < end; i++)
2251         {
2252             for (m = 0; m < DIM; m++)
2253             {
2254                 /* Re-reverse the velocities */
2255                 state->v[i][m] = -state->v[i][m];
2256             }
2257         }
2258     }
2259     sfree(savex);
2260 }
2261
2262
2263 static void
2264 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2265                 double *enerout, double *virout)
2266 {
2267     double enersum, virsum;
2268     double invscale, invscale2, invscale3;
2269     double r, ea, eb, ec, pa, pb, pc, pd;
2270     double y0, f, g, h;
2271     int    ri, offset;
2272     double tabfactor;
2273
2274     invscale  = 1.0/scale;
2275     invscale2 = invscale*invscale;
2276     invscale3 = invscale*invscale2;
2277
2278     /* Following summation derived from cubic spline definition,
2279      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2280      * the cubic spline.  We first calculate the negative of the
2281      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2282      * add the more standard, abrupt cutoff correction to that result,
2283      * yielding the long-range correction for a switched function.  We
2284      * perform both the pressure and energy loops at the same time for
2285      * simplicity, as the computational cost is low. */
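    /* In bin ri of the table the coordinate is r(t) = (ri + t)/scale with
     * t in [0,1].  Expanding gives r(t)^2 dr = (ea*t^2 + eb*t + ec) dt and
     * r(t)^3 = pa*t^3 + pb*t^2 + pc*t + pd, with the coefficients set in the
     * loop below.  Integrating the spline y0 + f*t + g*t^2 + h*t^3 (and its
     * derivative, for the virial) against these polynomials over one bin
     * yields the 1/3, 1/2, 1/4, ... weights used for enersum and virsum.
     */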
2286
2287     if (offstart == 0)
2288     {
2289         /* Since the dispersion table has been scaled down a factor
2290          * 6.0 and the repulsion a factor 12.0 to compensate for the
2291          * c6/c12 parameters inside nbfp[] being scaled up (to save
2292          * flops in kernels), we need to correct for this.
2293          */
2294         tabfactor = 6.0;
2295     }
2296     else
2297     {
2298         tabfactor = 12.0;
2299     }
2300
2301     enersum = 0.0;
2302     virsum  = 0.0;
2303     for (ri = rstart; ri < rend; ++ri)
2304     {
2305         r  = ri*invscale;
2306         ea = invscale3;
2307         eb = 2.0*invscale2*r;
2308         ec = invscale*r*r;
2309
2310         pa = invscale3;
2311         pb = 3.0*invscale2*r;
2312         pc = 3.0*invscale*r*r;
2313         pd = r*r*r;
2314
2315         /* this "8" is from the packing in the vdwtab array - perhaps
2316            should be defined? */
2317
2318         offset = 8*ri + offstart;
2319         y0     = vdwtab[offset];
2320         f      = vdwtab[offset+1];
2321         g      = vdwtab[offset+2];
2322         h      = vdwtab[offset+3];
2323
2324         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2325         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2326     }
2327     *enerout = 4.0*M_PI*enersum*tabfactor;
2328     *virout  = 4.0*M_PI*virsum*tabfactor;
2329 }
2330
2331 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2332 {
2333     double   eners[2], virs[2], enersum, virsum;
2334     double   r0, rc3, rc9;
2335     int      ri0, ri1, i;
2336     real     scale, *vdwtab;
2337
2338     fr->enershiftsix    = 0;
2339     fr->enershifttwelve = 0;
2340     fr->enerdiffsix     = 0;
2341     fr->enerdifftwelve  = 0;
2342     fr->virdiffsix      = 0;
2343     fr->virdifftwelve   = 0;
2344
2345     const interaction_const_t *ic = fr->ic;
2346
2347     if (eDispCorr != edispcNO)
2348     {
2349         for (i = 0; i < 2; i++)
2350         {
2351             eners[i] = 0;
2352             virs[i]  = 0;
2353         }
2354         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2355             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2356             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2357             (ic->vdwtype == evdwSHIFT) ||
2358             (ic->vdwtype == evdwSWITCH))
2359         {
2360             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2361                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2362                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2363             {
2364                 gmx_fatal(FARGS,
2365                           "With dispersion correction rvdw-switch can not be zero "
2366                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2367             }
2368
2369             /* TODO This code depends on the logic in tables.c that
2370                constructs the table layout, which should be made
2371                explicit in future cleanup. */
2372             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2373                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2374             scale  = fr->dispersionCorrectionTable->scale;
2375             vdwtab = fr->dispersionCorrectionTable->data;
2376
2377             /* Round the cut-offs to exact table values for precision */
2378             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2379             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2380
2381             /* The code below has some support for handling force-switching, i.e.
2382              * when the force (instead of potential) is switched over a limited
2383              * region. This leads to a constant shift in the potential inside the
2384              * switching region, which we can handle by adding a constant energy
2385              * term in the force-switch case just like when we do potential-shift.
2386              *
2387              * For now this is not enabled, but to keep the functionality in the
2388              * code we check separately for switch and shift. When we do force-switch
2389              * the shifting point is rvdw_switch, while it is the cutoff when we
2390              * have a classical potential-shift.
2391              *
2392              * For a pure potential-shift the potential has a constant shift
2393              * all the way out to the cutoff, and that is it. For other forms
2394              * we need to calculate the constant shift up to the point where we
2395              * start modifying the potential.
2396              */
2397             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2398
2399             r0   = ri0/scale;
2400             rc3  = r0*r0*r0;
2401             rc9  = rc3*rc3*rc3;
2402
2403             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2404                 (ic->vdwtype == evdwSHIFT))
2405             {
2406                 /* Determine the constant energy shift below rvdw_switch.
2407                  * Table has a scale factor since we have scaled it down to compensate
2408                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2409                  */
2410                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2411                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2412             }
2413             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2414             {
2415                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2416                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2417             }
2418
2419             /* Add the constant part from 0 to rvdw_switch.
2420              * This integration from 0 to rvdw_switch overcounts the number
2421              * of interactions by 1, as it also counts the self interaction.
2422              * We will correct for this later.
2423              */
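            /* The factor 4*M_PI*rc3/3.0 below is the volume of the sphere
             * r < r0, i.e. the constant shift integrated as
             * shift * int_0^r0 4*pi*r^2 dr.
             */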
2424             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2425             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2426
2427             /* Calculate the contribution in the range [r0,r1] where we
2428              * modify the potential. For a pure potential-shift modifier we will
2429              * have ri0==ri1, and there will not be any contribution here.
2430              */
2431             for (i = 0; i < 2; i++)
2432             {
2433                 enersum = 0;
2434                 virsum  = 0;
2435                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2436                 eners[i] -= enersum;
2437                 virs[i]  -= virsum;
2438             }
2439
2440             /* Alright: Above we compensated by REMOVING the parts outside r0
2441      * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2442              *
2443              * Regardless of whether r0 is the point where we start switching,
2444              * or the cutoff where we calculated the constant shift, we include
2445              * all the parts we are missing out to infinity from r0 by
2446              * calculating the analytical dispersion correction.
2447              */
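            /* These are the standard Lennard-Jones tail integrals for the
             * normalized 1/r^6 and 1/r^12 terms:
             *   int_r0^inf 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
             *   int_r0^inf 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*r0^9)
             * and the corresponding r*dV/dr integrals for the virial.
             */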
2448             eners[0] += -4.0*M_PI/(3.0*rc3);
2449             eners[1] +=  4.0*M_PI/(9.0*rc9);
2450             virs[0]  +=  8.0*M_PI/rc3;
2451             virs[1]  += -16.0*M_PI/(3.0*rc9);
2452         }
2453         else if (ic->vdwtype == evdwCUT ||
2454                  EVDW_PME(ic->vdwtype) ||
2455                  ic->vdwtype == evdwUSER)
2456         {
2457             if (ic->vdwtype == evdwUSER && fplog)
2458             {
2459                 fprintf(fplog,
2460                         "WARNING: using dispersion correction with user tables\n");
2461             }
2462
2463             /* Note that with LJ-PME, the dispersion correction is multiplied
2464              * by the difference between the actual C6 and the value of C6
2465              * that would produce the combination rule.
2466              * This means the normal energy and virial difference formulas
2467              * can be used here.
2468              */
2469
2470             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2471             rc9  = rc3*rc3*rc3;
2472             /* Contribution beyond the cut-off */
2473             eners[0] += -4.0*M_PI/(3.0*rc3);
2474             eners[1] +=  4.0*M_PI/(9.0*rc9);
2475             if (ic->vdw_modifier == eintmodPOTSHIFT)
2476             {
2477                 /* Contribution within the cut-off */
2478                 eners[0] += -4.0*M_PI/(3.0*rc3);
2479                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2480             }
2481             /* Contribution beyond the cut-off */
2482             virs[0]  +=  8.0*M_PI/rc3;
2483             virs[1]  += -16.0*M_PI/(3.0*rc9);
2484         }
2485         else
2486         {
2487             gmx_fatal(FARGS,
2488                       "Dispersion correction is not implemented for vdw-type = %s",
2489                       evdw_names[ic->vdwtype]);
2490         }
2491
2492         /* When we deprecate the group kernels the code below can go too */
2493         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2494         {
2495             /* Calculate self-interaction coefficient (assuming that
2496              * the reciprocal-space contribution is constant in the
2497              * region that contributes to the self-interaction).
2498              */
2499             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2500
2501             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2502             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2503         }
2504
2505         fr->enerdiffsix    = eners[0];
2506         fr->enerdifftwelve = eners[1];
2507         /* The 0.5 is due to the Gromacs definition of the virial */
2508         fr->virdiffsix     = 0.5*virs[0];
2509         fr->virdifftwelve  = 0.5*virs[1];
2510     }
2511 }
2512
2513 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2514                    matrix box, real lambda, tensor pres, tensor virial,
2515                    real *prescorr, real *enercorr, real *dvdlcorr)
2516 {
2517     gmx_bool bCorrAll, bCorrPres;
2518     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2519     int      m;
2520
2521     *prescorr = 0;
2522     *enercorr = 0;
2523     *dvdlcorr = 0;
2524
2525     clear_mat(virial);
2526     clear_mat(pres);
2527
2528     if (ir->eDispCorr != edispcNO)
2529     {
2530         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2531                      ir->eDispCorr == edispcAllEnerPres);
2532         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2533                      ir->eDispCorr == edispcAllEnerPres);
2534
2535         invvol = 1/det(box);
2536         if (fr->n_tpi)
2537         {
2538             /* Only correct for the interactions with the inserted molecule */
2539             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2540             ninter = fr->n_tpi;
2541         }
2542         else
2543         {
2544             dens   = fr->numAtomsForDispersionCorrection*invvol;
2545             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2546         }
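        /* dens is the homogeneous density of partner atoms seen by each atom,
         * and ninter = N/2 avoids double-counting pairs, so ninter*dens gives
         * the usual N^2/(2V) pair-count prefactor for the correction.
         */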
2547
2548         if (ir->efep == efepNO)
2549         {
2550             avcsix    = fr->avcsix[0];
2551             avctwelve = fr->avctwelve[0];
2552         }
2553         else
2554         {
2555             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2556             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2557         }
2558
2559         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2560         *enercorr += avcsix*enerdiff;
2561         dvdlambda  = 0.0;
2562         if (ir->efep != efepNO)
2563         {
2564             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2565         }
2566         if (bCorrAll)
2567         {
2568             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2569             *enercorr += avctwelve*enerdiff;
2570             if (fr->efep != efepNO)
2571             {
2572                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2573             }
2574         }
2575
2576         if (bCorrPres)
2577         {
2578             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2579             if (ir->eDispCorr == edispcAllEnerPres)
2580             {
2581                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2582             }
2583             /* The factor 2 is because of the Gromacs virial definition */
2584             spres = -2.0*invvol*svir*PRESFAC;
2585
2586             for (m = 0; m < DIM; m++)
2587             {
2588                 virial[m][m] += svir;
2589                 pres[m][m]   += spres;
2590             }
2591             *prescorr += spres;
2592         }
2593
2594         /* Can't currently control when it prints; for now, just print when debugging */
2595         if (debug)
2596         {
2597             if (bCorrAll)
2598             {
2599                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2600                         avcsix, avctwelve);
2601             }
2602             if (bCorrPres)
2603             {
2604                 fprintf(debug,
2605                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2606                         *enercorr, spres, svir);
2607             }
2608             else
2609             {
2610                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2611             }
2612         }
2613
2614         if (fr->efep != efepNO)
2615         {
2616             *dvdlcorr += dvdlambda;
2617         }
2618     }
2619 }
2620
2621 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2622                             const gmx_mtop_t *mtop, rvec x[],
2623                             gmx_bool bFirst)
2624 {
2625     t_graph        *graph;
2626     int             as, mol;
2627
2628     if (bFirst && fplog)
2629     {
2630         fprintf(fplog, "Removing pbc first time\n");
2631     }
2632
2633     snew(graph, 1);
2634     as = 0;
2635     for (const gmx_molblock_t &molb : mtop->molblock)
2636     {
2637         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2638         if (moltype.atoms.nr == 1 ||
2639             (!bFirst && moltype.cgs.nr == 1))
2640         {
2641             /* Just one atom or charge group in the molecule, no PBC required */
2642             as += molb.nmol*moltype.atoms.nr;
2643         }
2644         else
2645         {
2646             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2647             mk_graph_ilist(nullptr, moltype.ilist,
2648                            0, moltype.atoms.nr, FALSE, FALSE, graph);
2649
2650             for (mol = 0; mol < molb.nmol; mol++)
2651             {
2652                 mk_mshift(fplog, graph, ePBC, box, x+as);
2653
2654                 shift_self(graph, box, x+as);
2655                 /* The molecule is whole now.
2656                  * We don't need the second mk_mshift call as in do_pbc_first,
2657                  * since we no longer need this graph.
2658                  */
2659
2660                 as += moltype.atoms.nr;
2661             }
2662             done_graph(graph);
2663         }
2664     }
2665     sfree(graph);
2666 }
2667
2668 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2669                        const gmx_mtop_t *mtop, rvec x[])
2670 {
2671     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2672 }
2673
2674 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2675                  const gmx_mtop_t *mtop, rvec x[])
2676 {
2677     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2678 }
2679
2680 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2681 {
2682     int t, nth;
2683     nth = gmx_omp_nthreads_get(emntDefault);
2684
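    /* Each thread handles one contiguous chunk of the atom range,
     * [natoms*t/nth, natoms*(t+1)/nth), so the chunks cover all atoms exactly once.
     */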
2685 #pragma omp parallel for num_threads(nth) schedule(static)
2686     for (t = 0; t < nth; t++)
2687     {
2688         try
2689         {
2690             size_t natoms = x.size();
2691             size_t offset = (natoms*t    )/nth;
2692             size_t len    = (natoms*(t + 1))/nth - offset;
2693             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2694         }
2695         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2696     }
2697 }
2698
2699 // TODO This can be cleaned up a lot, and move back to runner.cpp
2700 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2701                 const t_inputrec *inputrec,
2702                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2703                 gmx_walltime_accounting_t walltime_accounting,
2704                 nonbonded_verlet_t *nbv,
2705                 gmx_pme_t *pme,
2706                 gmx_bool bWriteStat)
2707 {
2708     t_nrnb *nrnb_tot = nullptr;
2709     double  delta_t  = 0;
2710     double  nbfs     = 0, mflop = 0;
2711     double  elapsed_time,
2712             elapsed_time_over_all_ranks,
2713             elapsed_time_over_all_threads,
2714             elapsed_time_over_all_threads_over_all_ranks;
2715     /* Control whether it is valid to print a report. Only the
2716        simulation master may print, but it should not do so if the run
2717        terminated e.g. before a scheduled reset step. This is
2718        complicated by the fact that PME ranks are unaware of the
2719        reason why they were sent a pmerecvqxFINISH. To avoid
2720        communication deadlocks, we always do the communication for the
2721        report, even if we've decided not to write the report, because
2722        how long it takes to finish the run is not important when we've
2723        decided not to report on the simulation performance.
2724
2725        Further, we only report performance for dynamical integrators,
2726        because those are the only ones for which we plan to
2727        consider doing any optimizations. */
2728     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2729
2730     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2731     {
2732         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2733         printReport = false;
2734     }
2735
2736     if (cr->nnodes > 1)
2737     {
2738         snew(nrnb_tot, 1);
2739 #if GMX_MPI
2740         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2741                       cr->mpi_comm_mysim);
2742 #endif
2743     }
2744     else
2745     {
2746         nrnb_tot = nrnb;
2747     }
2748
2749     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2750     elapsed_time_over_all_ranks                  = elapsed_time;
2751     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2752     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2753 #if GMX_MPI
2754     if (cr->nnodes > 1)
2755     {
2756         /* Reduce elapsed_time over all MPI ranks in the current simulation */
2757         MPI_Allreduce(&elapsed_time,
2758                       &elapsed_time_over_all_ranks,
2759                       1, MPI_DOUBLE, MPI_SUM,
2760                       cr->mpi_comm_mysim);
2761         elapsed_time_over_all_ranks /= cr->nnodes;
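             /* The reduction above sums the per-rank elapsed times, so dividing
              * by the number of ranks turns the result into an average. */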
2762         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2763          * current simulation. */
2764         MPI_Allreduce(&elapsed_time_over_all_threads,
2765                       &elapsed_time_over_all_threads_over_all_ranks,
2766                       1, MPI_DOUBLE, MPI_SUM,
2767                       cr->mpi_comm_mysim);
2768     }
2769 #endif
2770
2771     if (printReport)
2772     {
2773         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2774     }
2775     if (cr->nnodes > 1)
2776     {
2777         sfree(nrnb_tot);
2778     }
2779
2780     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2781     {
2782         print_dd_statistics(cr, inputrec, fplog);
2783     }
2784
2785     /* TODO Move the responsibility for any scaling by thread counts
2786      * to the code that handled the thread region, so that there's a
2787      * mechanism to keep cycle counting working during the transition
2788      * to task parallelism. */
2789     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2790     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2791     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2792     auto cycle_sum(wallcycle_sum(cr, wcycle));
2793
2794     if (printReport)
2795     {
2796         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2797         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2798         if (pme_gpu_task_enabled(pme))
2799         {
2800             pme_gpu_get_timings(pme, &pme_gpu_timings);
2801         }
2802         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2803                         elapsed_time_over_all_ranks,
2804                         wcycle, cycle_sum,
2805                         nbnxn_gpu_timings,
2806                         &pme_gpu_timings);
2807
2808         if (EI_DYNAMICS(inputrec->eI))
2809         {
2810             delta_t = inputrec->delta_t;
2811         }
2812
2813         if (fplog)
2814         {
2815             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2816                        elapsed_time_over_all_ranks,
2817                        walltime_accounting_get_nsteps_done(walltime_accounting),
2818                        delta_t, nbfs, mflop);
2819         }
2820         if (bWriteStat)
2821         {
2822             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2823                        elapsed_time_over_all_ranks,
2824                        walltime_accounting_get_nsteps_done(walltime_accounting),
2825                        delta_t, nbfs, mflop);
2826         }
2827     }
2828 }
2829
2830 void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2831 {
2832     /* this function works, but could probably use a logic rewrite to keep all the different
2833        types of efep straight. */
2834
2835     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2836     {
2837         return;
2838     }
2839
2840     t_lambda *fep = ir->fepvals;
2841     *fep_state    = fep->init_fep_state; /* this might overwrite the value read from
2842                                             a checkpoint -- for now a kludge is in place
2843                                             to prevent that. */
2844
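         /* Each lambda component is taken either from the scalar init_lambda
          * (legacy input, applied to every component) or from the per-component
          * vector of the initial FEP state; lam0, when provided, records the
          * same starting values. */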
2845     for (int i = 0; i < efptNR; i++)
2846     {
2847         /* For now, overwrite the lambda state with init_lambda for backwards compatibility */
2848         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2849         {
2850             lambda[i] = fep->init_lambda;
2851             if (lam0)
2852             {
2853                 lam0[i] = lambda[i];
2854             }
2855         }
2856         else
2857         {
2858             lambda[i] = fep->all_lambda[i][*fep_state];
2859             if (lam0)
2860             {
2861                 lam0[i] = lambda[i];
2862             }
2863         }
2864     }
2865     if (ir->bSimTemp)
2866     {
2867         /* need to rescale control temperatures to match current state */
2868         for (int i = 0; i < ir->opts.ngtc; i++)
2869         {
2870             if (ir->opts.ref_t[i] > 0)
2871             {
2872                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2873             }
2874         }
2875     }
2876
2877     /* Report the initial lambda components to the log */
2878     if (fplog != nullptr)
2879     {
2880         fprintf(fplog, "Initial vector of lambda components: [ ");
2881         for (const auto &l : lambda)
2882         {
2883             fprintf(fplog, "%10.4f ", l);
2884         }
2885         fprintf(fplog, "]\n");
2886     }
2887 }
2888
2889
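     /* One-time setup before the main MD loop: initial time, the simulated-annealing
      * flag, lambda values, the update and center-of-mass motion machinery, citations,
      * flop counters, output files and the virial/dipole accumulators. */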
2890 void init_md(FILE *fplog,
2891              const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2892              t_inputrec *ir, const gmx_output_env_t *oenv,
2893              const MdrunOptions &mdrunOptions,
2894              double *t, double *t0,
2895              t_state *globalState, double *lam0,
2896              t_nrnb *nrnb, gmx_mtop_t *mtop,
2897              gmx_update_t **upd,
2898              gmx::BoxDeformation *deform,
2899              int nfile, const t_filenm fnm[],
2900              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2901              tensor force_vir, tensor shake_vir, rvec mu_tot,
2902              gmx_bool *bSimAnn, t_vcm **vcm,
2903              gmx_wallcycle_t wcycle)
2904 {
2905     int  i;
2906
2907     /* Initial values */
2908     *t = *t0       = ir->init_t;
2909
2910     *bSimAnn = FALSE;
2911     for (i = 0; i < ir->opts.ngtc; i++)
2912     {
2913         /* set bSimAnn if any group is being annealed */
2914         if (ir->opts.annealing[i] != eannNO)
2915         {
2916             *bSimAnn = TRUE;
2917         }
2918     }
2919
2920     /* Initialize lambda variables */
2921     /* TODO: Clean up initialization of fep_state and lambda in t_state.
2922      * We currently need to call initialize_lambdas on non-master ranks
2923      * to initialize lam0.
2924      */
2925     if (MASTER(cr))
2926     {
2927         initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2928     }
2929     else
2930     {
2931         int                      tmpFepState;
2932         std::array<real, efptNR> tmpLambda;
2933         initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2934     }
2935
2936     // TODO upd is never NULL in practice, but the analysers don't know that
2937     if (upd)
2938     {
2939         *upd = init_update(ir, deform);
2940     }
2941     if (*bSimAnn)
2942     {
2943         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2944     }
2945
2946     if (vcm != nullptr)
2947     {
2948         *vcm = init_vcm(fplog, &mtop->groups, ir);
2949     }
2950
2951     if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2952     {
2953         if (ir->etc == etcBERENDSEN)
2954         {
2955             please_cite(fplog, "Berendsen84a");
2956         }
2957         if (ir->etc == etcVRESCALE)
2958         {
2959             please_cite(fplog, "Bussi2007a");
2960         }
2961         if (ir->eI == eiSD1)
2962         {
2963             please_cite(fplog, "Goga2012");
2964         }
2965     }
2966     init_nrnb(nrnb);
2967
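         /* Output files and the energy bookkeeping are only set up when the
          * caller passed a file list (nfile != -1). */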
2968     if (nfile != -1)
2969     {
2970         *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2971
2972         *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2973                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2974     }
2975
2976     /* Initialize the virial tensors and the total dipole vector */
2977     clear_mat(force_vir);
2978     clear_mat(shake_vir);
2979     clear_rvec(mu_tot);
2980 }