/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "sim_util.h"

#include "config.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <cstdint>

#include <array>

#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"

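/*! \brief Print the current step number and an estimate of the remaining
 * wall-clock time to \p out; when the total number of steps is not known
 * (nsteps < 0), the current performance in ns/day is printed instead.
 */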
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#if !GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    fflush(out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
            /* delta_t is in ps: divide by 1000 for ns/step, scale by 24*60*60 s/day */
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#if !GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}

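/*! \brief Print \p title, the rank and a ctime-formatted version of
 * \p the_time to the log file, if there is one.
 */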
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char   time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        /* Copy the printable prefix of the ctime string, dropping the trailing newline */
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}

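/*! \brief Print a "Started <name>" line with the run's start time stamp to the log file. */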
void print_start(FILE *fplog, t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}

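/*! \brief Add the forces in \p forceToAdd to \p f, parallelized over OpenMP threads. */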
static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
{
    /* TODO: remove this - 1 when padding is properly implemented */
    int         end  = forceToAdd->size() - 1;
    const rvec *fAdd = as_rvec_array(forceToAdd->data());

    // cppcheck-suppress unreadVariable
    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], fAdd[i]);
    }
}

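/*! \brief Calculate the partial virial for the local atoms from the short-range
 * forces, including the shift-force, position-restraint and wall contributions.
 */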
static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i;

    /* The short-range virial from surrounding boxes */
    clear_mat(vir_part);
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

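/*! \brief Compute the center-of-mass pull forces and add the pull potential
 * energy and its dV/dlambda contribution to \p enerd.
 */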
static void pull_potential_wrapper(t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces; this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}

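/*! \brief With separate PME ranks, receive the long-range forces, virials,
 * energies and dV/dlambda terms from the PME ranks and accumulate them.
 */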
static void pme_receive_force_ener(t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e_q, e_lj, dvdl_q, dvdl_lj;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

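/*! \brief Print all local forces with norm at least \p forceTolerance, as well
 * as all non-finite forces; exits with a fatal error if non-finite forces are
 * present.
 */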
static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_int64_t step, real forceTolerance,
                               const rvec *x, const rvec *f)
{
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
    }
}

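/*! \brief Post-process the forces: spread the no-virial-summation (mesh)
 * forces on virtual sites, add them to \p f together with their virial
 * contributions when requested, and optionally print large forces.
 */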
static void post_process_forces(t_commrec *cr,
                                gmx_int64_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces; this is local */
            sum_forces(f, fr->f_novirsum);

            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (EVDW_PME(fr->vdwtype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_lj_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

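/*! \brief Run the non-bonded kernel for interaction locality \p ilocality:
 * launch the GPU kernel or run the CPU kernel (pruning the CPU pair list
 * first when a dynamic pruning step is due), and count the flops in \p nrnb.
 */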
static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         gmx_int64_t step,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbvg->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbvg->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");

    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}

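/*! \brief Run the free-energy non-bonded kernel over all pair-list threads,
 * accumulating dV/dlambda, and with soft-core recompute the non-linear
 * energy contributions for each foreign lambda point.
 */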
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             t_mdatoms            *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
    }

    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we are doing foreign lambda and have soft-core interactions,
     * we have to recalculate the (non-linear) energy contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}

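/*! \brief Return whether the non-bonded kernels run on a GPU. */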
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}

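/*! \brief Zero the first \p n elements of \p v, using OpenMP threads for large \p n. */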
static gmx_inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}

/*! \brief  This routine checks if the potential energy is finite.
 *
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] enerd  The energy data; the non-bonded group energies need to be added in here before calling this routine
 */
static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
{
    if (!std::isfinite(enerd->term[F_EPOT]))
    {
        gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
                  enerd->term[F_EPOT],
                  enerd->term[F_LJ],
                  enerd->term[F_COUL_SR]);
    }
}

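/*! \brief Compute forces and, as requested by \p flags, energies and the
 * virial for one MD step with the Verlet cut-off scheme; GPU non-bonded
 * kernel launches and MPI communication are ordered so that they can
 * overlap with CPU force computation.
 */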
void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                         t_inputrec *inputrec,
                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                         gmx_localtop_t *top,
                         gmx_groups_t gmx_unused *groups,
                         matrix box, rvec x[], history_t *hist,
                         PaddedRVecVector *force,
                         tensor vir_force,
                         t_mdatoms *mdatoms,
                         gmx_enerdata_t *enerd, t_fcdata *fcd,
                         real *lambda, t_graph *graph,
                         t_forcerec *fr, interaction_const_t *ic,
                         gmx_vsite_t *vsite, rvec mu_tot,
                         double t, gmx_edsam_t ed,
                         gmx_bool bBornRadii,
                         int flags,
                         DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                         DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    gmx_bool            bDiffKernels = FALSE;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu;

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS &&
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
    }

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->grp[0].nbat);

#if GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        gmx_bool bBS;
        matrix   boxs;

        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */

        wallcycle_start(wcycle, ewcPP_PMESENDX);

        bBS = (inputrec->nwall == 2);
        if (bBS)
        {
            copy_mat(box, boxs);
            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
        }

        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
                                 step);

        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, nullptr,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->grp[eintLocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        if (nbv->ngrp == 1 ||
            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        else
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
                               nbv->nbs, mdatoms, fr->cginfo);
            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                            &top->excls,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
        {
            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
        }
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->grp[eintLocal].nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            ddOpenBalanceRegionGpu(cr->dd);
        }

        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
        /* launch local nonbonded F on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
                        nbv->grp[eintLocal].kernel_type);

        if (bDiffKernels)
        {
            /* With GPU+CPU non-bonded calculations we need to copy
             * the local coordinates to the non-local nbat struct
             * (in CPU format) as the non-local kernel call also
             * calculates the local - non-local interactions.
             */
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (bDiffKernels)
            {
                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
            }

            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                &top->excls,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);
            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
            {
                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
            }
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU && !bDiffKernels)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
            /* launch non-local nonbonded F on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        if (DOMAINDECOMP(cr) && !bDiffKernels)
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
                                 flags, eatLocal);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force->data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    if (bDoForces)
    {
        /* Reset forces for which the virial is calculated separately:
         * PME/Ewald forces if necessary */
        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                fr->f_novirsum = fr->forceBufferNoVirialSummation;
            }
            else
            {
                /* We are not calculating the pressure so we do not need
                 * a separate array for forces that do not contribute
                 * to the pressure.
                 */
                fr->f_novirsum = force;
            }
        }

        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                /* TODO: remove this - 1 when padding is properly implemented */
                clear_rvecs_omp(fr->f_novirsum->size() - 1,
                                as_rvec_array(fr->f_novirsum->data()));
            }
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* When the non-bonded forces are done on the CPU, we calculate them here.
     * We do this before calling do_force_lowlevel, because in that
     * function the listed forces are calculated before PME, which
     * does communication. With this order, imbalance between the
     * non-bonded and listed force calculations can be balanced out
     * by the domain decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU || bDiffKernels)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
                         step, nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded forces to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, x, mdatoms, box, top);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, nrnb, wcycle, mdatoms,
                      x, hist, f, enerd, fcd, top, fr->born,
                      bBornRadii, box,
                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);

    if (ed)
    {
        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
    }

    if (bUseOrEmulGPU && !bDiffKernels)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                       flags, eatNonlocal,
                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                       fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                               nbv->grp[eintNonlocal].nbat, f);
            }
            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
        if (bDoForces)
        {
            wallcycle_start(wcycle, ewcMOVEF);
            dd_move_f(cr->dd, f, fr->fshift);
            wallcycle_stop(wcycle, ewcMOVEF);
        }
    }

    if (bUseOrEmulGPU)
    {
        /* wait for local forces (or calculate in emulation mode) */
        if (bUseGPU)
        {
            /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
             * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
             * but even with a step of 0.1 ms the difference is less than 1%
             * of the step time.
             */
            const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                   flags, eatLocal,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
            float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

            if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            {
                DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
                if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
                {
                    /* We measured only a few cycles, so it could be that the
                     * kernel and transfer finished earlier and there was no
                     * actual wait time, only API call overhead.
                     * Then the actual time could be anywhere between 0 and
                     * cycles_wait_est. We will use half of cycles_wait_est.
                     */
                    waitedForGpu = DdBalanceRegionWaitedForGpu::no;
                }
                ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
            }

            /* now clear the GPU outputs while we finish the step on the CPU */
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);

            /* Is dynamic pair-list pruning activated? */
            if (nbv->listParams->useDynamicPruning)
            {
                /* We should not launch the rolling pruning kernel at a search
                 * step or just before search steps, since that's useless.
                 * Without domain decomposition we prune at even steps.
                 * With domain decomposition we alternate local and non-local
                 * pruning at even and odd steps.
                 */
                int  numRollingParts     = nbv->listParams->numRollingParts;
                GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
                int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
                bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
                if (stepWithCurrentList > 0 &&
                    stepWithCurrentList < inputrec->nstlist - 1 &&
                    (stepIsEven || DOMAINDECOMP(cr)))
                {
                    nbnxn_gpu_launch_kernel_pruneonly(fr->nbv->gpu_nbv,
                                                      stepIsEven ? eintLocal : eintNonlocal,
                                                      numRollingParts);
                }
            }
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
        else
        {
            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);
        }
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                       nbv->grp[eintLocal].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }

    if (bDoForces)
    {
        /* Collect forces from modules */
        gmx::ArrayRef<gmx::RVec> fNoVirSum;
        if (fr->bF_NoVirSum)
        {
            fNoVirSum = *fr->f_novirsum;
        }
        fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
        {
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, x, f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        /* Since the COM pulling is always done mass-weighted, no forces are
         * applied to vsites and this call can be done after vsite spreading.
         */
        pull_potential_wrapper(cr, inputrec, box, x,
                               f, vir_force, mdatoms, enerd, lambda, t,
                               wcycle);
    }

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);

    if (PAR(cr) && !(cr->duty & DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, wcycle, enerd, fr);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(enerd);
        }
    }
}

1468 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1469                         t_inputrec *inputrec,
1470                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1471                         gmx_localtop_t *top,
1472                         gmx_groups_t *groups,
1473                         matrix box, rvec x[], history_t *hist,
1474                         PaddedRVecVector *force,
1475                         tensor vir_force,
1476                         t_mdatoms *mdatoms,
1477                         gmx_enerdata_t *enerd, t_fcdata *fcd,
1478                         real *lambda, t_graph *graph,
1479                         t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1480                         double t, gmx_edsam_t ed,
1481                         gmx_bool bBornRadii,
1482                         int flags,
1483                         DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1484                         DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1485 {
1486     int        cg0, cg1, i, j;
1487     double     mu[2*DIM];
1488     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1489     gmx_bool   bDoForces;
1490     float      cycles_pme;
1491
1492     const int  start  = 0;
1493     const int  homenr = mdatoms->homenr;
1494
1495     clear_mat(vir_force);
1496
1497     cg0 = 0;
1498     if (DOMAINDECOMP(cr))
1499     {
1500         cg1 = cr->dd->ncg_tot;
1501     }
1502     else
1503     {
1504         cg1 = top->cgs.nr;
1505     }
1506     if (fr->n_tpi > 0)
1507     {
1508         cg1--;
1509     }
1510
1511     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1512     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1513     /* Should we fill the neighborsearching grid with the (updated) coordinates this step? */
1514     bFillGrid      = (bNS && bStateChanged);
1515     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1516     bDoForces      = (flags & GMX_FORCE_FORCES);
1517
1518     if (bStateChanged)
1519     {
1520         update_forcerec(fr, box);
1521
1522         if (inputrecNeedMutot(inputrec))
1523         {
1524             /* Calculate total (local) dipole moment in a temporary common array.
1525              * This makes it possible to sum them over nodes faster.
1526              */
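             /* mu[0..DIM-1] holds the state A dipole and mu[DIM..2*DIM-1] the
              * state B dipole, so a single gmx_sumd call can reduce both.
              */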
1527             calc_mu(start, homenr,
1528                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1529                     mu, mu+DIM);
1530         }
1531     }
1532
1533     if (fr->ePBC != epbcNONE)
1534     {
1535         /* Compute shift vectors every step,
1536          * because of pressure coupling or box deformation!
1537          */
1538         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1539         {
1540             calc_shifts(box, fr->shift_vec);
1541         }
1542
1543         if (bCalcCGCM)
1544         {
1545             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1546                                      &(top->cgs), x, fr->cg_cm);
1547             inc_nrnb(nrnb, eNR_CGCM, homenr);
1548             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1549         }
1550         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1551         {
1552             unshift_self(graph, box, x);
1553         }
1554     }
1555     else if (bCalcCGCM)
1556     {
1557         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1558         inc_nrnb(nrnb, eNR_CGCM, homenr);
1559     }
1560
1561     if (bCalcCGCM && gmx_debug_at)
1562     {
1563         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1564     }
1565
1566 #if GMX_MPI
1567     if (!(cr->duty & DUTY_PME))
1568     {
1569         gmx_bool bBS;
1570         matrix   boxs;
1571
1572          * Send particle coordinates to the PME nodes.
1573          * Since this is only implemented for domain decomposition
1574          * and domain decomposition does not use the graph,
1575          * we do not need to worry about shifting.
1576          */
1577
1578         wallcycle_start(wcycle, ewcPP_PMESENDX);
1579
1580         bBS = (inputrec->nwall == 2);
1581         if (bBS)
1582         {
1583             copy_mat(box, boxs);
1584             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1585         }
1586
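         /* With two walls the Ewald sum uses a box elongated along z by
          * wall_ewald_zfac, so the PME ranks get the scaled box.
          */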
1587         gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1588                                  lambda[efptCOUL], lambda[efptVDW],
1589                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1590                                  step);
1591
1592         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1593     }
1594 #endif /* GMX_MPI */
1595
1596     /* Communicate coordinates and sum dipole if necessary */
1597     if (DOMAINDECOMP(cr))
1598     {
1599         wallcycle_start(wcycle, ewcMOVEX);
1600         dd_move_x(cr->dd, box, x);
1601         wallcycle_stop(wcycle, ewcMOVEX);
1602         /* No GPU support, no move_x overlap, so reopen the balance region here */
1603         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1604         {
1605             ddReopenBalanceRegionCpu(cr->dd);
1606         }
1607     }
1608
1609     if (inputrecNeedMutot(inputrec))
1610     {
1611         if (bStateChanged)
1612         {
1613             if (PAR(cr))
1614             {
1615                 gmx_sumd(2*DIM, mu, cr);
1616                 ddReopenBalanceRegionCpu(cr->dd);
1617             }
1618             for (i = 0; i < 2; i++)
1619             {
1620                 for (j = 0; j < DIM; j++)
1621                 {
1622                     fr->mu_tot[i][j] = mu[i*DIM + j];
1623                 }
1624             }
1625         }
1626         if (fr->efep == efepNO)
1627         {
1628             copy_rvec(fr->mu_tot[0], mu_tot);
1629         }
1630         else
1631         {
1632             for (j = 0; j < DIM; j++)
1633             {
1634                 mu_tot[j] =
1635                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1636             }
1637         }
1638     }
1639
1640     /* Reset energies */
1641     reset_enerdata(enerd);
1642     clear_rvecs(SHIFTS, fr->fshift);
1643
1644     if (bNS)
1645     {
1646         wallcycle_start(wcycle, ewcNS);
1647
1648         if (graph && bStateChanged)
1649         {
1650             /* Calculate intramolecular shift vectors to make molecules whole */
1651             mk_mshift(fplog, graph, fr->ePBC, box, x);
1652         }
1653
1654         /* Do the actual neighbour searching */
1655         ns(fplog, fr, box,
1656            groups, top, mdatoms,
1657            cr, nrnb, bFillGrid);
1658
1659         wallcycle_stop(wcycle, ewcNS);
1660     }
1661
1662     if (inputrec->implicit_solvent && bNS)
1663     {
1664         make_gb_nblist(cr, inputrec->gb_algorithm,
1665                        x, box, fr, &top->idef, graph, fr->born);
1666     }
1667
1668     if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1669     {
1670         wallcycle_start(wcycle, ewcPPDURINGPME);
1671         dd_force_flop_start(cr->dd, nrnb);
1672     }
1673
1674     if (inputrec->bRot)
1675     {
1676         wallcycle_start(wcycle, ewcROT);
1677         do_rotation(cr, inputrec, box, x, t, step, bNS);
1678         wallcycle_stop(wcycle, ewcROT);
1679     }
1680
1681     /* Temporary solution until all routines take PaddedRVecVector */
1682     rvec *f = as_rvec_array(force->data());
1683
1684     /* Start the force cycle counter.
1685      * Note that a different counter is used for dynamic load balancing.
1686      */
1687     wallcycle_start(wcycle, ewcFORCE);
1688
1689     if (bDoForces)
1690     {
1691         /* Reset forces for which the virial is calculated separately:
1692          * PME/Ewald forces if necessary */
1693         if (fr->bF_NoVirSum)
1694         {
1695             if (flags & GMX_FORCE_VIRIAL)
1696             {
1697                 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1698                 /* TODO: remove this - 1 when padding is properly implemented */
1699                 clear_rvecs(fr->f_novirsum->size() - 1,
1700                             as_rvec_array(fr->f_novirsum->data()));
1701             }
1702             else
1703             {
1704                 /* We are not calculating the pressure so we do not need
1705                  * a separate array for forces that do not contribute
1706                  * to the pressure.
1707                  */
1708                 fr->f_novirsum = force;
1709             }
1710         }
1711
1712         /* Clear the short- and long-range forces */
1713         clear_rvecs(fr->natoms_force_constr, f);
1714
1715         clear_rvec(fr->vir_diag_posres);
1716     }
1717     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1718     {
1719         clear_pull_forces(inputrec->pull_work);
1720     }
1721
1722     /* update QMMMrec, if necessary */
1723     if (fr->bQMMM)
1724     {
1725         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1726     }
1727
1728     /* Compute the bonded and non-bonded energies and optionally forces */
1729     do_force_lowlevel(fr, inputrec, &(top->idef),
1730                       cr, nrnb, wcycle, mdatoms,
1731                       x, hist, f, enerd, fcd, top, fr->born,
1732                       bBornRadii, box,
1733                       inputrec->fepvals, lambda,
1734                       graph, &(top->excls), fr->mu_tot,
1735                       flags,
1736                       &cycles_pme);
1737
1738     wallcycle_stop(wcycle, ewcFORCE);
1739
1740     if (DOMAINDECOMP(cr))
1741     {
1742         dd_force_flop_stop(cr->dd, nrnb);
1743
1744         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1745         {
1746             ddCloseBalanceRegionCpu(cr->dd);
1747         }
1748     }
1749
1750     if (ed)
1751     {
1752         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1753     }
1754
1755     if (bDoForces)
1756     {
1757         /* Collect forces from modules */
1758         gmx::ArrayRef<gmx::RVec> fNoVirSum;
1759         if (fr->bF_NoVirSum)
1760         {
1761             fNoVirSum = *fr->f_novirsum;
1762         }
1763         fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1764
1765         /* Communicate the forces */
1766         if (DOMAINDECOMP(cr))
1767         {
1768             wallcycle_start(wcycle, ewcMOVEF);
1769             dd_move_f(cr->dd, f, fr->fshift);
1770             /* Do we need to communicate the separate force array
1771              * for terms that do not contribute to the single sum virial?
1772              * Position restraints and electric fields do not introduce
1773              * inter-cg forces, only full electrostatics methods do.
1774              * When we do not calculate the virial, fr->f_novirsum = f,
1775              * so we have already communicated these forces.
1776              */
1777             if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1778                 (flags & GMX_FORCE_VIRIAL))
1779             {
1780                 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1781             }
1782             wallcycle_stop(wcycle, ewcMOVEF);
1783         }
1784
1785         /* If we have NoVirSum forces, but we do not calculate the virial,
1786          * fr->f_novirsum aliases f and post_process_forces will spread
1787          * it, so we skip the vsite spreading here. */
1788         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1789         {
1790             wallcycle_start(wcycle, ewcVSITESPREAD);
1791             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1792                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1793             wallcycle_stop(wcycle, ewcVSITESPREAD);
1794         }
1795
1796         if (flags & GMX_FORCE_VIRIAL)
1797         {
1798             /* Calculation of the virial must be done after vsites! */
1799             calc_virial(0, mdatoms->homenr, x, f,
1800                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1801         }
1802     }
1803
1804     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1805     {
1806         pull_potential_wrapper(cr, inputrec, box, x,
1807                                f, vir_force, mdatoms, enerd, lambda, t,
1808                                wcycle);
1809     }
1810
1811     /* Add the forces from enforced rotation potentials (if any) */
1812     if (inputrec->bRot)
1813     {
1814         wallcycle_start(wcycle, ewcROTadd);
1815         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1816         wallcycle_stop(wcycle, ewcROTadd);
1817     }
1818
1819     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1820     IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1821
1822     if (PAR(cr) && !(cr->duty & DUTY_PME))
1823     {
1824         /* In case of node-splitting, the PP nodes receive the long-range
1825          * forces, virial and energy from the PME nodes here.
1826          */
1827         pme_receive_force_ener(cr, wcycle, enerd, fr);
1828     }
1829
1830     if (bDoForces)
1831     {
1832         post_process_forces(cr, step, nrnb, wcycle,
1833                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1834                             flags);
1835     }
1836
1837     if (flags & GMX_FORCE_ENERGY)
1838     {
1839         /* Sum the potential energy terms from group contributions */
1840         sum_epot(&(enerd->grpp), enerd->term);
1841
1842         if (!EI_TPI(inputrec->eI))
1843         {
1844             checkPotentialEnergyValidity(enerd);
1845         }
1846     }
1847
1848 }
1849
1850 void do_force(FILE *fplog, t_commrec *cr,
1851               t_inputrec *inputrec,
1852               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1853               gmx_localtop_t *top,
1854               gmx_groups_t *groups,
1855               matrix box, PaddedRVecVector *coordinates, history_t *hist,
1856               PaddedRVecVector *force,
1857               tensor vir_force,
1858               t_mdatoms *mdatoms,
1859               gmx_enerdata_t *enerd, t_fcdata *fcd,
1860               gmx::ArrayRef<real> lambda, t_graph *graph,
1861               t_forcerec *fr,
1862               gmx_vsite_t *vsite, rvec mu_tot,
1863               double t, gmx_edsam_t ed,
1864               gmx_bool bBornRadii,
1865               int flags,
1866               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1867               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1868 {
1869     /* modify force flag if not doing nonbonded */
1870     if (!fr->bNonbonded)
1871     {
1872         flags &= ~GMX_FORCE_NONBONDED;
1873     }
1874
1875     GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1876                fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1877     GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1878                fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1879
1880     rvec *x = as_rvec_array(coordinates->data());
1881
1882     switch (inputrec->cutoff_scheme)
1883     {
1884         case ecutsVERLET:
1885             do_force_cutsVERLET(fplog, cr, inputrec,
1886                                 step, nrnb, wcycle,
1887                                 top,
1888                                 groups,
1889                                 box, x, hist,
1890                                 force, vir_force,
1891                                 mdatoms,
1892                                 enerd, fcd,
1893                                 lambda.data(), graph,
1894                                 fr, fr->ic,
1895                                 vsite, mu_tot,
1896                                 t, ed,
1897                                 bBornRadii,
1898                                 flags,
1899                                 ddOpenBalanceRegion,
1900                                 ddCloseBalanceRegion);
1901             break;
1902         case ecutsGROUP:
1903             do_force_cutsGROUP(fplog, cr, inputrec,
1904                                step, nrnb, wcycle,
1905                                top,
1906                                groups,
1907                                box, x, hist,
1908                                force, vir_force,
1909                                mdatoms,
1910                                enerd, fcd,
1911                                lambda.data(), graph,
1912                                fr, vsite, mu_tot,
1913                                t, ed,
1914                                bBornRadii,
1915                                flags,
1916                                ddOpenBalanceRegion,
1917                                ddCloseBalanceRegion);
1918             break;
1919         default:
1920             gmx_incons("Invalid cut-off scheme passed!");
1921     }
1922
1923     /* In case we don't have constraints and are using GPUs, the next balancing
1924      * region starts here.
1925      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1926      * virial calculation and COM pulling, is thus not included in
1927      * the balance timing, which is ok as most tasks do communication.
1928      */
1929     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1930     {
1931         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1932     }
1933 }
1934
1935
1936 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1937                         t_inputrec *ir, t_mdatoms *md,
1938                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1939                         t_forcerec *fr, gmx_localtop_t *top)
1940 {
1941     int             i, m, start, end;
1942     gmx_int64_t     step;
1943     real            dt = ir->delta_t;
1944     real            dvdl_dum;
1945     rvec           *savex;
1946
1947     /* We need to allocate one element extra, since we might use
1948      * (unaligned) 4-wide SIMD loads to access rvec entries.
1949      */
1950     snew(savex, state->natoms + 1);
1951
1952     start = 0;
1953     end   = md->homenr;
1954
1955     if (debug)
1956     {
1957         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1958                 start, md->homenr, end);
1959     }
1960     /* Do a first constrain to reset particles... */
1961     step = ir->init_step;
1962     if (fplog)
1963     {
1964         char buf[STEPSTRSIZE];
1965         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1966                 gmx_step_str(step, buf));
1967     }
1968     dvdl_dum = 0;
1969
1970     /* constrain the current position */
1971     constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1972               ir, cr, step, 0, 1.0, md,
1973               as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1974               fr->bMolPBC, state->box,
1975               state->lambda[efptBONDED], &dvdl_dum,
1976               nullptr, nullptr, nrnb, econqCoord);
1977     if (EI_VV(ir->eI))
1978     {
1979         /* constrain the initial velocity, and save it */
1980         /* this may also be useful if we need the ekin from the half step for velocity Verlet */
1981         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1982                   ir, cr, step, 0, 1.0, md,
1983                   as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1984                   fr->bMolPBC, state->box,
1985                   state->lambda[efptBONDED], &dvdl_dum,
1986                   nullptr, nullptr, nrnb, econqVeloc);
1987     }
1988     /* constrain the initial velocities at t-dt/2 */
1989     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1990     {
1991         for (i = start; (i < end); i++)
1992         {
1993             for (m = 0; (m < DIM); m++)
1994             {
1995                 /* Reverse the velocity */
1996                 state->v[i][m] = -state->v[i][m];
1997                 /* Store the position at t-dt in buf */
1998                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1999             }
2000         }
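         /* With v reversed, x + dt*v equals x(t) - dt*v(t), i.e. the
          * unconstrained position one time step back.
          */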
2001         /* Shake the positions at t=-dt with the positions at t=0
2002          * as reference coordinates.
2003          */
2004         if (fplog)
2005         {
2006             char buf[STEPSTRSIZE];
2007             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2008                     gmx_step_str(step, buf));
2009         }
2010         dvdl_dum = 0;
2011         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2012                   ir, cr, step, -1, 1.0, md,
2013                   as_rvec_array(state->x.data()), savex, nullptr,
2014                   fr->bMolPBC, state->box,
2015                   state->lambda[efptBONDED], &dvdl_dum,
2016                   as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2017
2018         for (i = start; i < end; i++)
2019         {
2020             for (m = 0; m < DIM; m++)
2021             {
2022                 /* Re-reverse the velocities */
2023                 state->v[i][m] = -state->v[i][m];
2024             }
2025         }
2026     }
2027     sfree(savex);
2028 }
2029
2030
2031 static void
2032 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2033                 double *enerout, double *virout)
2034 {
2035     double enersum, virsum;
2036     double invscale, invscale2, invscale3;
2037     double r, ea, eb, ec, pa, pb, pc, pd;
2038     double y0, f, g, h;
2039     int    ri, offset;
2040     double tabfactor;
2041
2042     invscale  = 1.0/scale;
2043     invscale2 = invscale*invscale;
2044     invscale3 = invscale*invscale2;
2045
2046     /* Following summation derived from cubic spline definition,
2047      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2048      * the cubic spline.  We first calculate the negative of the
2049      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2050      * add the more standard, abrupt cutoff correction to that result,
2051      * yielding the long-range correction for a switched function.  We
2052      * perform both the pressure and energy loops at the same time for
2053      * simplicity, as the computational cost is low. */
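     /* Sketch of the integrals evaluated below: with t in [0,1] the fraction
      * into table bin ri, r = (ri + t)*invscale and the spline is
      * y(t) = y0 + f*t + g*t^2 + h*t^3.  Expanding r^2*dr in t gives
      * (ea*t^2 + eb*t + ec)*dt, and int_0^1 t^n dt = 1/(n+1) then yields the
      * y0, f, g and h weights in enersum.  Likewise r^3*(dy/dr)*dr =
      * (pa*t^3 + pb*t^2 + pc*t + pd)*(f + 2*g*t + 3*h*t^2)*dt, which yields
      * the weights in virsum.  Both are exact for the cubic spline.
      */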
2054
2055     if (offstart == 0)
2056     {
2057         /* Since the dispersion table has been scaled down a factor
2058          * 6.0 and the repulsion a factor 12.0 to compensate for the
2059          * c6/c12 parameters inside nbfp[] being scaled up (to save
2060          * flops in kernels), we need to correct for this.
2061          */
2062         tabfactor = 6.0;
2063     }
2064     else
2065     {
2066         tabfactor = 12.0;
2067     }
2068
2069     enersum = 0.0;
2070     virsum  = 0.0;
2071     for (ri = rstart; ri < rend; ++ri)
2072     {
2073         r  = ri*invscale;
2074         ea = invscale3;
2075         eb = 2.0*invscale2*r;
2076         ec = invscale*r*r;
2077
2078         pa = invscale3;
2079         pb = 3.0*invscale2*r;
2080         pc = 3.0*invscale*r*r;
2081         pd = r*r*r;
2082
2083         /* this "8" is the stride of vdwtab: 4 spline coefficients each
2084            for dispersion and repulsion - perhaps it should be defined? */
2085
2086         offset = 8*ri + offstart;
2087         y0     = vdwtab[offset];
2088         f      = vdwtab[offset+1];
2089         g      = vdwtab[offset+2];
2090         h      = vdwtab[offset+3];
2091
2092         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2093         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2094     }
2095     *enerout = 4.0*M_PI*enersum*tabfactor;
2096     *virout  = 4.0*M_PI*virsum*tabfactor;
2097 }
2098
2099 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2100 {
2101     double   eners[2], virs[2], enersum, virsum;
2102     double   r0, rc3, rc9;
2103     int      ri0, ri1, i;
2104     real     scale, *vdwtab;
2105
2106     fr->enershiftsix    = 0;
2107     fr->enershifttwelve = 0;
2108     fr->enerdiffsix     = 0;
2109     fr->enerdifftwelve  = 0;
2110     fr->virdiffsix      = 0;
2111     fr->virdifftwelve   = 0;
2112
2113     if (eDispCorr != edispcNO)
2114     {
2115         for (i = 0; i < 2; i++)
2116         {
2117             eners[i] = 0;
2118             virs[i]  = 0;
2119         }
2120         if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2121             (fr->vdw_modifier == eintmodPOTSWITCH) ||
2122             (fr->vdw_modifier == eintmodFORCESWITCH) ||
2123             (fr->vdwtype == evdwSHIFT) ||
2124             (fr->vdwtype == evdwSWITCH))
2125         {
2126             if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2127                  (fr->vdw_modifier == eintmodFORCESWITCH) ||
2128                  (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2129             {
2130                 gmx_fatal(FARGS,
2131                           "With dispersion correction rvdw-switch can not be zero "
2132                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2133             }
2134
2135             /* TODO This code depends on the logic in tables.c that
2136                constructs the table layout, which should be made
2137                explicit in future cleanup. */
2138             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2139                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2140             scale  = fr->dispersionCorrectionTable->scale;
2141             vdwtab = fr->dispersionCorrectionTable->data;
2142
2143             /* Round the cut-offs to exact table values for precision */
2144             ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
2145             ri1  = static_cast<int>(ceil(fr->rvdw*scale));
2146
2147             /* The code below has some support for handling force-switching, i.e.
2148              * when the force (instead of potential) is switched over a limited
2149              * region. This leads to a constant shift in the potential inside the
2150              * switching region, which we can handle by adding a constant energy
2151              * term in the force-switch case just like when we do potential-shift.
2152              *
2153              * For now this is not enabled, but to keep the functionality in the
2154              * code we check separately for switch and shift. When we do force-switch
2155              * the shifting point is rvdw_switch, while it is the cutoff when we
2156              * have a classical potential-shift.
2157              *
2158              * For a pure potential-shift the potential has a constant shift
2159              * all the way out to the cutoff, and that is it. For other forms
2160              * we need to calculate the constant shift up to the point where we
2161              * start modifying the potential.
2162              */
2163             ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2164
2165             r0   = ri0/scale;
2166             rc3  = r0*r0*r0;
2167             rc9  = rc3*rc3*rc3;
2168
2169             if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2170                 (fr->vdwtype == evdwSHIFT))
2171             {
2172                 /* Determine the constant energy shift below rvdw_switch.
2173                  * Table has a scale factor since we have scaled it down to compensate
2174                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2175                  */
2176                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2177                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2178             }
2179             else if (fr->vdw_modifier == eintmodPOTSHIFT)
2180             {
2181                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2182                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2183             }
2184
2185             /* Add the constant part from 0 to rvdw_switch.
2186              * This integration from 0 to rvdw_switch overcounts the number
2187              * of interactions by 1, as it also counts the self interaction.
2188              * We will correct for this later.
2189              */
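             /* 4*M_PI*rc3/3.0 is the volume of the sphere of radius r0 over
              * which the constant shift acts.
              */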
2190             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2191             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2192
2193             /* Calculate the contribution in the range [r0,r1] where we
2194              * modify the potential. For a pure potential-shift modifier we will
2195              * have ri0==ri1, and there will not be any contribution here.
2196              */
2197             for (i = 0; i < 2; i++)
2198             {
2199                 enersum = 0;
2200                 virsum  = 0;
2201                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2202                 eners[i] -= enersum;
2203                 virs[i]  -= virsum;
2204             }
2205
2206             /* Alright: Above we compensated by REMOVING the parts outside r0
2207      * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2208              *
2209              * Regardless of whether r0 is the point where we start switching,
2210              * or the cutoff where we calculated the constant shift, we include
2211              * all the parts we are missing out to infinity from r0 by
2212              * calculating the analytical dispersion correction.
2213              */
2214             eners[0] += -4.0*M_PI/(3.0*rc3);
2215             eners[1] +=  4.0*M_PI/(9.0*rc9);
2216             virs[0]  +=  8.0*M_PI/rc3;
2217             virs[1]  += -16.0*M_PI/(3.0*rc9);
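             /* These are the analytic tail integrals:
              * int_r0^inf 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3),
              * int_r0^inf 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9),
              * and int_r0^inf 4*pi*r^3*(dV/dr) dr for the virial, giving
              * 8*pi/r0^3 and -16*pi/(3*r0^9); the 0.5 Gromacs-virial factor
              * is applied at the end of this function.
              */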
2218         }
2219         else if (fr->vdwtype == evdwCUT ||
2220                  EVDW_PME(fr->vdwtype) ||
2221                  fr->vdwtype == evdwUSER)
2222         {
2223             if (fr->vdwtype == evdwUSER && fplog)
2224             {
2225                 fprintf(fplog,
2226                         "WARNING: using dispersion correction with user tables\n");
2227             }
2228
2229             /* Note that with LJ-PME, the dispersion correction is multiplied
2230              * by the difference between the actual C6 and the value of C6
2231              * that would produce the combination rule.
2232              * This means the normal energy and virial difference formulas
2233              * can be used here.
2234              */
2235
2236             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2237             rc9  = rc3*rc3*rc3;
2238             /* Contribution beyond the cut-off */
2239             eners[0] += -4.0*M_PI/(3.0*rc3);
2240             eners[1] +=  4.0*M_PI/(9.0*rc9);
2241             if (fr->vdw_modifier == eintmodPOTSHIFT)
2242             {
2243                 /* Contribution within the cut-off */
2244                 eners[0] += -4.0*M_PI/(3.0*rc3);
2245                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2246             }
2247             /* Contribution beyond the cut-off */
2248             virs[0]  +=  8.0*M_PI/rc3;
2249             virs[1]  += -16.0*M_PI/(3.0*rc9);
2250         }
2251         else
2252         {
2253             gmx_fatal(FARGS,
2254                       "Dispersion correction is not implemented for vdw-type = %s",
2255                       evdw_names[fr->vdwtype]);
2256         }
2257
2258         /* When we deprecate the group kernels the code below can go too */
2259         if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2260         {
2261             /* Calculate self-interaction coefficient (assuming that
2262              * the reciprocal-space contribution is constant in the
2263              * region that contributes to the self-interaction).
2264              */
2265             fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2266
2267             eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2268             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2269         }
2270
2271         fr->enerdiffsix    = eners[0];
2272         fr->enerdifftwelve = eners[1];
2273         /* The 0.5 is due to the Gromacs definition of the virial */
2274         fr->virdiffsix     = 0.5*virs[0];
2275         fr->virdifftwelve  = 0.5*virs[1];
2276     }
2277 }
2278
2279 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2280                    matrix box, real lambda, tensor pres, tensor virial,
2281                    real *prescorr, real *enercorr, real *dvdlcorr)
2282 {
2283     gmx_bool bCorrAll, bCorrPres;
2284     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2285     int      m;
2286
2287     *prescorr = 0;
2288     *enercorr = 0;
2289     *dvdlcorr = 0;
2290
2291     clear_mat(virial);
2292     clear_mat(pres);
2293
2294     if (ir->eDispCorr != edispcNO)
2295     {
2296         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2297                      ir->eDispCorr == edispcAllEnerPres);
2298         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2299                      ir->eDispCorr == edispcAllEnerPres);
2300
2301         invvol = 1/det(box);
2302         if (fr->n_tpi)
2303         {
2304             /* Only correct for the interactions with the inserted molecule */
2305             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2306             ninter = fr->n_tpi;
2307         }
2308         else
2309         {
2310             dens   = fr->numAtomsForDispersionCorrection*invvol;
2311             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2312         }
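         /* For N homogeneous atoms the N*(N-1)/2 ~ N^2/2 pairs enter as
          * ninter = N/2 interaction sites times the density dens = N/V.
          */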
2313
2314         if (ir->efep == efepNO)
2315         {
2316             avcsix    = fr->avcsix[0];
2317             avctwelve = fr->avctwelve[0];
2318         }
2319         else
2320         {
2321             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2322             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2323         }
2324
2325         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2326         *enercorr += avcsix*enerdiff;
2327         dvdlambda  = 0.0;
2328         if (ir->efep != efepNO)
2329         {
2330             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2331         }
2332         if (bCorrAll)
2333         {
2334             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2335             *enercorr += avctwelve*enerdiff;
2336             if (fr->efep != efepNO)
2337             {
2338                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2339             }
2340         }
2341
2342         if (bCorrPres)
2343         {
2344             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2345             if (ir->eDispCorr == edispcAllEnerPres)
2346             {
2347                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2348             }
2349             /* The factor 2 is because of the Gromacs virial definition */
2350             spres = -2.0*invvol*svir*PRESFAC;
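             /* Per component, P_mm = 2*(Ekin_mm - Xi_mm)/V * PRESFAC in
              * GROMACS, so adding svir to the virial diagonal shifts the
              * pressure diagonal by -2*svir*PRESFAC/V.
              */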
2351
2352             for (m = 0; m < DIM; m++)
2353             {
2354                 virial[m][m] += svir;
2355                 pres[m][m]   += spres;
2356             }
2357             *prescorr += spres;
2358         }
2359
2360         /* Can't currently control when it prints; for now, just print when debugging */
2361         if (debug)
2362         {
2363             if (bCorrAll)
2364             {
2365                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2366                         avcsix, avctwelve);
2367             }
2368             if (bCorrPres)
2369             {
2370                 fprintf(debug,
2371                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2372                         *enercorr, spres, svir);
2373             }
2374             else
2375             {
2376                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2377             }
2378         }
2379
2380         if (fr->efep != efepNO)
2381         {
2382             *dvdlcorr += dvdlambda;
2383         }
2384     }
2385 }
2386
2387 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2388                             const gmx_mtop_t *mtop, rvec x[],
2389                             gmx_bool bFirst)
2390 {
2391     t_graph        *graph;
2392     int             mb, as, mol;
2393     gmx_molblock_t *molb;
2394
2395     if (bFirst && fplog)
2396     {
2397         fprintf(fplog, "Removing pbc first time\n");
2398     }
2399
2400     snew(graph, 1);
2401     as = 0;
2402     for (mb = 0; mb < mtop->nmolblock; mb++)
2403     {
2404         molb = &mtop->molblock[mb];
2405         if (molb->natoms_mol == 1 ||
2406             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2407         {
2408             /* Just one atom or charge group in the molecule, no PBC required */
2409             as += molb->nmol*molb->natoms_mol;
2410         }
2411         else
2412         {
2413             /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2414             mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2415                            0, molb->natoms_mol, FALSE, FALSE, graph);
2416
2417             for (mol = 0; mol < molb->nmol; mol++)
2418             {
2419                 mk_mshift(fplog, graph, ePBC, box, x+as);
2420
2421                 shift_self(graph, box, x+as);
2422                 /* The molecule is whole now.
2423                  * We don't need the second mk_mshift call as in do_pbc_first,
2424                  * since we no longer need this graph.
2425                  */
2426
2427                 as += molb->natoms_mol;
2428             }
2429             done_graph(graph);
2430         }
2431     }
2432     sfree(graph);
2433 }
2434
2435 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2436                        const gmx_mtop_t *mtop, rvec x[])
2437 {
2438     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2439 }
2440
2441 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2442                  gmx_mtop_t *mtop, rvec x[])
2443 {
2444     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2445 }
2446
2447 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2448 {
2449     int t, nth;
2450     nth = gmx_omp_nthreads_get(emntDefault);
2451
2452 #pragma omp parallel for num_threads(nth) schedule(static)
2453     for (t = 0; t < nth; t++)
2454     {
2455         try
2456         {
2457             int offset, len;
2458
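            /* Static split of the atom range over the threads, e.g.
             * natoms = 10, nth = 4 gives ranges [0,2), [2,5), [5,7), [7,10).
             */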
2459             offset = (natoms*t    )/nth;
2460             len    = (natoms*(t + 1))/nth - offset;
2461             put_atoms_in_box(ePBC, box, len, x + offset);
2462         }
2463         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2464     }
2465 }
2466
2467 // TODO This can be cleaned up a lot, and move back to runner.cpp
2468 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2469                 const t_inputrec *inputrec,
2470                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2471                 gmx_walltime_accounting_t walltime_accounting,
2472                 nonbonded_verlet_t *nbv,
2473                 gmx_bool bWriteStat)
2474 {
2475     t_nrnb *nrnb_tot = nullptr;
2476     double  delta_t  = 0;
2477     double  nbfs     = 0, mflop = 0;
2478     double  elapsed_time,
2479             elapsed_time_over_all_ranks,
2480             elapsed_time_over_all_threads,
2481             elapsed_time_over_all_threads_over_all_ranks;
2482     /* Control whether it is valid to print a report. Only the
2483        simulation master may print, but it should not do so if the run
2484        terminated e.g. before a scheduled reset step. This is
2485        complicated by the fact that PME ranks are unaware of the
2486        reason why they were sent a pmerecvqxFINISH. To avoid
2487        communication deadlocks, we always do the communication for the
2488        report, even if we've decided not to write the report, because
2489        how long it takes to finish the run is not important when we've
2490        decided not to report on the simulation performance. */
2491     bool    printReport = SIMMASTER(cr);
2492
2493     if (!walltime_accounting_get_valid_finish(walltime_accounting))
2494     {
2495         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2496         printReport = false;
2497     }
2498
2499     if (cr->nnodes > 1)
2500     {
2501         snew(nrnb_tot, 1);
2502 #if GMX_MPI
2503         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2504                       cr->mpi_comm_mysim);
2505 #endif
2506     }
2507     else
2508     {
2509         nrnb_tot = nrnb;
2510     }
2511
2512     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2513     elapsed_time_over_all_ranks                  = elapsed_time;
2514     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2515     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2516 #if GMX_MPI
2517     if (cr->nnodes > 1)
2518     {
2519         /* reduce elapsed_time over all MPI ranks in the current simulation */
2520         MPI_Allreduce(&elapsed_time,
2521                       &elapsed_time_over_all_ranks,
2522                       1, MPI_DOUBLE, MPI_SUM,
2523                       cr->mpi_comm_mysim);
2524         elapsed_time_over_all_ranks /= cr->nnodes;
2525         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2526          * current simulation. */
2527         MPI_Allreduce(&elapsed_time_over_all_threads,
2528                       &elapsed_time_over_all_threads_over_all_ranks,
2529                       1, MPI_DOUBLE, MPI_SUM,
2530                       cr->mpi_comm_mysim);
2531     }
2532 #endif
2533
2534     if (printReport)
2535     {
2536         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2537     }
2538     if (cr->nnodes > 1)
2539     {
2540         sfree(nrnb_tot);
2541     }
2542
2543     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2544     {
2545         print_dd_statistics(cr, inputrec, fplog);
2546     }
2547
2548     /* TODO Move the responsibility for any scaling by thread counts
2549      * to the code that handled the thread region, so that there's a
2550      * mechanism to keep cycle counting working during the transition
2551      * to task parallelism. */
2552     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2553     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2554     wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2555     auto cycle_sum(wallcycle_sum(cr, wcycle));
2556
2557     if (printReport)
2558     {
2559         struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2560
2561         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2562                         elapsed_time_over_all_ranks,
2563                         wcycle, cycle_sum, gputimes);
2564
2565         if (EI_DYNAMICS(inputrec->eI))
2566         {
2567             delta_t = inputrec->delta_t;
2568         }
2569
2570         if (fplog)
2571         {
2572             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2573                        elapsed_time_over_all_ranks,
2574                        walltime_accounting_get_nsteps_done(walltime_accounting),
2575                        delta_t, nbfs, mflop);
2576         }
2577         if (bWriteStat)
2578         {
2579             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2580                        elapsed_time_over_all_ranks,
2581                        walltime_accounting_get_nsteps_done(walltime_accounting),
2582                        delta_t, nbfs, mflop);
2583         }
2584     }
2585 }
2586
2587 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2588 {
2589     /* this function works, but could probably use a logic rewrite to keep all the different
2590        types of efep straight. */
2591
2592     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2593     {
2594         return;
2595     }
2596
2597     t_lambda *fep = ir->fepvals;
2598     *fep_state    = fep->init_fep_state; /* this might overwrite the value read
2599                                             from the checkpoint, if one is set --
2600                                             a kludge is in place for now to prevent this. */
2601
2602     for (int i = 0; i < efptNR; i++)
2603     {
2604         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2605         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2606         {
2607             lambda[i] = fep->init_lambda;
2608             if (lam0)
2609             {
2610                 lam0[i] = lambda[i];
2611             }
2612         }
2613         else
2614         {
2615             lambda[i] = fep->all_lambda[i][*fep_state];
2616             if (lam0)
2617             {
2618                 lam0[i] = lambda[i];
2619             }
2620         }
2621     }
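     /* Net effect: a single scalar init_lambda >= 0 sets every component to
      * that value; otherwise component i takes entry *fep_state of its
      * lambda vector fep->all_lambda[i].
      */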
2622     if (ir->bSimTemp)
2623     {
2624         /* need to rescale control temperatures to match current state */
2625         for (int i = 0; i < ir->opts.ngtc; i++)
2626         {
2627             if (ir->opts.ref_t[i] > 0)
2628             {
2629                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2630             }
2631         }
2632     }
2633
2634     /* Send to the log the information on the current lambdas */
2635     if (fplog != nullptr)
2636     {
2637         fprintf(fplog, "Initial vector of lambda components: [ ");
2638         for (const auto &l : lambda)
2639         {
2640             fprintf(fplog, "%10.4f ", l);
2641         }
2642         fprintf(fplog, "]\n");
2643     }
2644     return;
2645 }
2646
2647
2648 void init_md(FILE *fplog,
2649              t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2650              t_inputrec *ir, const gmx_output_env_t *oenv,
2651              double *t, double *t0,
2652              gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2653              t_nrnb *nrnb, gmx_mtop_t *mtop,
2654              gmx_update_t **upd,
2655              int nfile, const t_filenm fnm[],
2656              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2657              tensor force_vir, tensor shake_vir, rvec mu_tot,
2658              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2659              gmx_wallcycle_t wcycle)
2660 {
2661     int  i;
2662
2663     /* Initial values */
2664     *t = *t0       = ir->init_t;
2665
2666     *bSimAnn = FALSE;
2667     for (i = 0; i < ir->opts.ngtc; i++)
2668     {
2669         /* set bSimAnn if any group is being annealed */
2670         if (ir->opts.annealing[i] != eannNO)
2671         {
2672             *bSimAnn = TRUE;
2673         }
2674     }
2675
2676     /* Initialize lambda variables */
2677     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2678
2679     // TODO upd is never NULL in practice, but the analysers don't know that
2680     if (upd)
2681     {
2682         *upd = init_update(ir);
2683     }
2684     if (*bSimAnn)
2685     {
2686         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2687     }
2688
2689     if (vcm != nullptr)
2690     {
2691         *vcm = init_vcm(fplog, &mtop->groups, ir);
2692     }
2693
2694     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2695     {
2696         if (ir->etc == etcBERENDSEN)
2697         {
2698             please_cite(fplog, "Berendsen84a");
2699         }
2700         if (ir->etc == etcVRESCALE)
2701         {
2702             please_cite(fplog, "Bussi2007a");
2703         }
2704         if (ir->eI == eiSD1)
2705         {
2706             please_cite(fplog, "Goga2012");
2707         }
2708     }
2709     init_nrnb(nrnb);
2710
2711     if (nfile != -1)
2712     {
2713         *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2714
2715         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2716                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2717     }
2718
2719     /* Initiate variables */
2720     clear_mat(force_vir);
2721     clear_mat(shake_vir);
2722     clear_rvec(mu_tot);
2723 }