Add dynamic pair-list pruning framework
src/gromacs/mdlib/sim_util.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "sim_util.h"

#include "config.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <cstdint>

#include <array>

#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"

void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#if !GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    fflush(out);

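    /* Only report an estimate once at least nstlist steps have passed,
     * presumably so the per-step average also reflects the cost of
     * neighbor-search steps.
     */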
    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
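            /* delta_t is in ps: dividing by 1000 converts to ns and
             * multiplying by 24*60*60 (seconds per day) gives ns/day.
             */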
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#if !GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}

void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char   time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}

void print_start(FILE *fplog, t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}

static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
{
    /* TODO: remove this - 1 when padding is properly implemented */
    int         end  = forceToAdd->size() - 1;
    const rvec *fAdd = as_rvec_array(forceToAdd->data());

    // cppcheck-suppress unreadVariable
    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], fAdd[i]);
    }
}

static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i;

    /* The short-range virial from surrounding boxes */
    clear_mat(vir_part);
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

static void pull_potential_wrapper(t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces; this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}

static void pme_receive_force_ener(t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e_q, e_lj, dvdl_q, dvdl_lj;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_int64_t step, real forceTolerance,
                               const rvec *x, const rvec *f)
{
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
    }
}

static void post_process_forces(t_commrec *cr,
                                gmx_int64_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, fr->f_novirsum);

            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (EVDW_PME(fr->vdwtype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_lj_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         gmx_int64_t step,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

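    /* Dynamic pruning framework: the outer pair list is built with the
     * longer rlistOuter cut-off at search steps and is pruned down to
     * the inner cut-off every nstlistPrune steps. In this framework
     * patch the CPU prune kernel itself is not hooked up yet; the
     * release assertion below marks its future call site.
     */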
    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * every nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            GMX_RELEASE_ASSERT(false, "The CPU prune kernel will be called here");
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

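    /* Dispatch on the kernel flavor selected at setup: plain-C 4x4 or
     * SIMD 4xN/2xNN kernels on the CPU, the native 8x8x8 GPU kernel,
     * or its plain-C reference implementation.
     */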
    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbvg->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbvg->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");

    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

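    /* Flop accounting: pick the eNR counter that matches the LJ+Coulomb
     * kernel flavor that was actually used above.
     */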
    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}

static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             t_mdatoms            *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
    }

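    /* The free-energy kernels run on the CPU with one pair list per
     * OpenMP thread, so the thread count must match the list count.
     */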
    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we are calculating foreign lambdas and have soft-core
     * interactions, we have to recalculate the (non-linear) energy
     * contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}

gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}

static gmx_inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}

/*! \brief  This routine checks whether the potential energy is finite.
 *
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] enerd  The energy data; the non-bonded group energies need to be added in here before calling this routine
 */
static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
{
    if (!std::isfinite(enerd->term[F_EPOT]))
    {
        gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
                  enerd->term[F_EPOT],
                  enerd->term[F_LJ],
                  enerd->term[F_COUL_SR]);
    }
}

void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                         t_inputrec *inputrec,
                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                         gmx_localtop_t *top,
                         gmx_groups_t gmx_unused *groups,
                         matrix box, rvec x[], history_t *hist,
                         PaddedRVecVector *force,
                         tensor vir_force,
                         t_mdatoms *mdatoms,
                         gmx_enerdata_t *enerd, t_fcdata *fcd,
                         real *lambda, t_graph *graph,
                         t_forcerec *fr, interaction_const_t *ic,
                         gmx_vsite_t *vsite, rvec mu_tot,
                         double t, gmx_edsam_t ed,
                         gmx_bool bBornRadii,
                         int flags,
                         DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                         DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    gmx_bool            bDiffKernels = FALSE;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu;
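    /* In GPU emulation mode the GPU code path is followed, with the
     * plain-C reference kernel (nbnxnk8x8x8_PlainC) standing in for
     * the real GPU kernel on the CPU.
     */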

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS &&
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
    }

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->grp[0].nbat);

#if GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        gmx_bool bBS;
        matrix   boxs;

        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */

        wallcycle_start(wcycle, ewcPP_PMESENDX);

        bBS = (inputrec->nwall == 2);
        if (bBS)
        {
            copy_mat(box, boxs);
            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
        }

        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
                                 step);

        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, nullptr,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->grp[eintLocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        if (nbv->ngrp == 1 ||
            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        else
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
                               nbv->nbs, mdatoms, fr->cginfo);
            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                            &top->excls,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
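        /* The list creation step recorded above anchors the pruning
         * schedule: both the CPU pruning in do_nb_verlet and the GPU
         * rolling prune count steps relative to it.
         */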
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
        {
            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
        }
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->grp[eintLocal].nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            ddOpenBalanceRegionGpu(cr->dd);
        }

        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
        /* launch local nonbonded F on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
                        nbv->grp[eintLocal].kernel_type);

        if (bDiffKernels)
        {
            /* With GPU+CPU non-bonded calculations we need to copy
             * the local coordinates to the non-local nbat struct
             * (in CPU format) as the non-local kernel call also
             * calculates the local - non-local interactions.
             */
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (bDiffKernels)
            {
                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
            }

            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                &top->excls,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);
            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
            {
                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
            }
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU && !bDiffKernels)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
            /* launch non-local nonbonded F on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        if (DOMAINDECOMP(cr) && !bDiffKernels)
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
                                 flags, eatLocal);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
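        /* With free-energy perturbation the total dipole is interpolated
         * linearly between the A and B states in the Coulomb lambda.
         */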
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force->data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    if (bDoForces)
    {
        /* Reset forces for which the virial is calculated separately:
         * PME/Ewald forces if necessary */
        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                fr->f_novirsum = fr->forceBufferNoVirialSummation;
            }
            else
            {
                /* We are not calculating the pressure so we do not need
                 * a separate array for forces that do not contribute
                 * to the pressure.
                 */
                fr->f_novirsum = force;
            }
        }

        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                /* TODO: remove this - 1 when padding is properly implemented */
                clear_rvecs_omp(fr->f_novirsum->size() - 1,
                                as_rvec_array(fr->f_novirsum->data()));
            }
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* When the non-bonded forces are computed on the CPU, we calculate
     * them here, before calling do_force_lowlevel, because in that
     * function the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU || bDiffKernels)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
                         step, nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded force to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, x, mdatoms, box, top);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, nrnb, wcycle, mdatoms,
                      x, hist, f, enerd, fcd, top, fr->born,
                      bBornRadii, box,
                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);

    if (ed)
    {
        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
    }

    if (bUseOrEmulGPU && !bDiffKernels)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                       flags, eatNonlocal,
                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                       fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                               nbv->grp[eintNonlocal].nbat, f);
            }
            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
        if (bDoForces)
        {
            wallcycle_start(wcycle, ewcMOVEF);
            dd_move_f(cr->dd, f, fr->fshift);
            wallcycle_stop(wcycle, ewcMOVEF);
        }
    }

    if (bUseOrEmulGPU)
    {
        /* wait for local forces (or calculate in emulation mode) */
        if (bUseGPU)
        {
            /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
             * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
             * but even with a step of 0.1 ms the difference is less than 1%
             * of the step time.
             */
            const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                   flags, eatLocal,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
            float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

            if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            {
                DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
                if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
                {
                    /* We measured few cycles; it could be that the kernel
                     * and transfer finished earlier and there was no actual
                     * wait time, only API call overhead.
                     * Then the actual time could be anywhere between 0 and
                     * cycles_wait_est. We will use half of cycles_wait_est.
                     */
1336                     waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1337                 }
1338                 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1339             }
1340
1341             /* now clear the GPU outputs while we finish the step on the CPU */
1342             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1343             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1344
1345             /* Is dynamic pair-list pruning activated? */
1346             if (nbv->listParams->useDynamicPruning)
1347             {
1348                 /* We should not launch the rolling pruning kernel at a search
1349                  * step or just before search steps, since that's useless.
1350                  * Without domain decomposition we prune at even steps.
1351                  * With domain decomposition we alternate local and non-local
1352                  * pruning at even and odd steps.
1353                  */
1354                 int  numRollingParts     = nbv->listParams->numRollingParts;
1355                 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1356                 int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1357                 bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1358                 if (stepWithCurrentList > 0 &&
1359                     stepWithCurrentList < inputrec->nstlist - 1 &&
1360                     (stepIsEven || DOMAINDECOMP(cr)))
1361                 {
1362                     GMX_RELEASE_ASSERT(false, "The GPU prune kernel will be called here");
1363                 }
1364             }
1365             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1366         }
1367         else
1368         {
1369             wallcycle_start_nocount(wcycle, ewcFORCE);
1370             do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1371                          DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1372                          step, nrnb, wcycle);
1373             wallcycle_stop(wcycle, ewcFORCE);
1374         }
1375         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1376         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1377         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1378                                        nbv->grp[eintLocal].nbat, f);
1379         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1380         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1381     }
1382
1383     if (DOMAINDECOMP(cr))
1384     {
1385         dd_force_flop_stop(cr->dd, nrnb);
1386     }
1387
1388     if (bDoForces)
1389     {
1390         /* Collect forces from modules */
1391         gmx::ArrayRef<gmx::RVec> fNoVirSum;
1392         if (fr->bF_NoVirSum)
1393         {
1394             fNoVirSum = *fr->f_novirsum;
1395         }
1396         fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1397
1398         /* If we have NoVirSum forces, but we do not calculate the virial,
1399          * we sum fr->f_novirsum=f later.
1400          */
1401         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1402         {
1403             wallcycle_start(wcycle, ewcVSITESPREAD);
1404             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1405                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1406             wallcycle_stop(wcycle, ewcVSITESPREAD);
1407         }
1408
1409         if (flags & GMX_FORCE_VIRIAL)
1410         {
1411             /* Calculation of the virial must be done after vsites! */
1412             calc_virial(0, mdatoms->homenr, x, f,
1413                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1414         }
1415     }
1416
1417     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1418     {
1419         /* Since the COM pulling is always done mass-weighted, no forces are
1420          * applied to vsites and this call can be done after vsite spreading.
1421          */
1422         pull_potential_wrapper(cr, inputrec, box, x,
1423                                f, vir_force, mdatoms, enerd, lambda, t,
1424                                wcycle);
1425     }
1426
1427     /* Add the forces from enforced rotation potentials (if any) */
1428     if (inputrec->bRot)
1429     {
1430         wallcycle_start(wcycle, ewcROTadd);
1431         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1432         wallcycle_stop(wcycle, ewcROTadd);
1433     }
1434
1435     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1436     IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1437
1438     if (PAR(cr) && !(cr->duty & DUTY_PME))
1439     {
1440         /* In case of node-splitting, the PP nodes receive the long-range
1441          * forces, virial and energy from the PME nodes here.
1442          */
1443         pme_receive_force_ener(cr, wcycle, enerd, fr);
1444     }
1445
1446     if (bDoForces)
1447     {
1448         post_process_forces(cr, step, nrnb, wcycle,
1449                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1450                             flags);
1451     }
1452
1453     if (flags & GMX_FORCE_ENERGY)
1454     {
1455         /* Sum the potential energy terms from group contributions */
1456         sum_epot(&(enerd->grpp), enerd->term);
1457
1458         if (!EI_TPI(inputrec->eI))
1459         {
1460             checkPotentialEnergyValidity(enerd);
1461         }
1462     }
1463 }
1464
1465 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1466                         t_inputrec *inputrec,
1467                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1468                         gmx_localtop_t *top,
1469                         gmx_groups_t *groups,
1470                         matrix box, rvec x[], history_t *hist,
1471                         PaddedRVecVector *force,
1472                         tensor vir_force,
1473                         t_mdatoms *mdatoms,
1474                         gmx_enerdata_t *enerd, t_fcdata *fcd,
1475                         real *lambda, t_graph *graph,
1476                         t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1477                         double t, gmx_edsam_t ed,
1478                         gmx_bool bBornRadii,
1479                         int flags,
1480                         DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1481                         DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1482 {
1483     int        cg0, cg1, i, j;
1484     double     mu[2*DIM];
1485     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1486     gmx_bool   bDoForces;
1487     float      cycles_pme;
1488
1489     const int  start  = 0;
1490     const int  homenr = mdatoms->homenr;
1491
1492     clear_mat(vir_force);
1493
1494     cg0 = 0;
1495     if (DOMAINDECOMP(cr))
1496     {
1497         cg1 = cr->dd->ncg_tot;
1498     }
1499     else
1500     {
1501         cg1 = top->cgs.nr;
1502     }
1503     if (fr->n_tpi > 0)
1504     {
1505         cg1--;
1506     }
1507
1508     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1509     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1510     /* Only (re)fill the neighbor-search grid when we search on updated coordinates */
1511     bFillGrid      = (bNS && bStateChanged);
1512     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1513     bDoForces      = (flags & GMX_FORCE_FORCES);
1514
1515     if (bStateChanged)
1516     {
1517         update_forcerec(fr, box);
1518
1519         if (inputrecNeedMutot(inputrec))
1520         {
1521             /* Calculate the total (local) dipole moments for states A and B
1522              * in a temporary common array, so they can be summed over nodes faster.
1523              */
1524             calc_mu(start, homenr,
1525                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1526                     mu, mu+DIM);
1527         }
1528     }
1529
1530     if (fr->ePBC != epbcNONE)
1531     {
1532         /* Compute shift vectors every step,
1533          * because of pressure coupling or box deformation!
1534          */
1535         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1536         {
1537             calc_shifts(box, fr->shift_vec);
1538         }
1539
1540         if (bCalcCGCM)
1541         {
1542             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1543                                      &(top->cgs), x, fr->cg_cm);
1544             inc_nrnb(nrnb, eNR_CGCM, homenr);
1545             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1546         }
1547         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1548         {
1549             unshift_self(graph, box, x);
1550         }
1551     }
1552     else if (bCalcCGCM)
1553     {
1554         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1555         inc_nrnb(nrnb, eNR_CGCM, homenr);
1556     }
1557
1558     if (bCalcCGCM && gmx_debug_at)
1559     {
1560         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1561     }
1562
1563 #if GMX_MPI
1564     if (!(cr->duty & DUTY_PME))
1565     {
1566         gmx_bool bBS;
1567         matrix   boxs;
1568
1569         /* Send particle coordinates to the pme nodes.
1570          * Since this is only implemented for domain decomposition
1571          * and domain decomposition does not use the graph,
1572          * we do not need to worry about shifting.
1573          */
1574
1575         wallcycle_start(wcycle, ewcPP_PMESENDX);
1576
1577         bBS = (inputrec->nwall == 2);
1578         if (bBS)
1579         {
1580             copy_mat(box, boxs);
1581             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1582         }
1583
1584         gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1585                                  lambda[efptCOUL], lambda[efptVDW],
1586                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1587                                  step);
1588
1589         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1590     }
1591 #endif /* GMX_MPI */
1592
1593     /* Communicate coordinates and sum dipole if necessary */
1594     if (DOMAINDECOMP(cr))
1595     {
1596         wallcycle_start(wcycle, ewcMOVEX);
1597         dd_move_x(cr->dd, box, x);
1598         wallcycle_stop(wcycle, ewcMOVEX);
1599         /* No GPU support, no move_x overlap, so reopen the balance region here */
1600         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1601         {
1602             ddReopenBalanceRegionCpu(cr->dd);
1603         }
1604     }
1605
1606     if (inputrecNeedMutot(inputrec))
1607     {
1608         if (bStateChanged)
1609         {
1610             if (PAR(cr))
1611             {
1612                 gmx_sumd(2*DIM, mu, cr);
1613                 ddReopenBalanceRegionCpu(cr->dd);
1614             }
1615             for (i = 0; i < 2; i++)
1616             {
1617                 for (j = 0; j < DIM; j++)
1618                 {
1619                     fr->mu_tot[i][j] = mu[i*DIM + j];
1620                 }
1621             }
1622         }
1623         if (fr->efep == efepNO)
1624         {
1625             copy_rvec(fr->mu_tot[0], mu_tot);
1626         }
1627         else
1628         {
1629             for (j = 0; j < DIM; j++)
1630             {
1631                 mu_tot[j] =
1632                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1633             }
1634         }
1635     }
1636
1637     /* Reset energies */
1638     reset_enerdata(enerd);
1639     clear_rvecs(SHIFTS, fr->fshift);
1640
1641     if (bNS)
1642     {
1643         wallcycle_start(wcycle, ewcNS);
1644
1645         if (graph && bStateChanged)
1646         {
1647             /* Calculate intramolecular shift vectors to make molecules whole */
1648             mk_mshift(fplog, graph, fr->ePBC, box, x);
1649         }
1650
1651         /* Do the actual neighbour searching */
1652         ns(fplog, fr, box,
1653            groups, top, mdatoms,
1654            cr, nrnb, bFillGrid);
1655
1656         wallcycle_stop(wcycle, ewcNS);
1657     }
1658
1659     if (inputrec->implicit_solvent && bNS)
1660     {
1661         make_gb_nblist(cr, inputrec->gb_algorithm,
1662                        x, box, fr, &top->idef, graph, fr->born);
1663     }
1664
1665     if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1666     {
1667         wallcycle_start(wcycle, ewcPPDURINGPME);
1668         dd_force_flop_start(cr->dd, nrnb);
1669     }
1670
1671     if (inputrec->bRot)
1672     {
1673         wallcycle_start(wcycle, ewcROT);
1674         do_rotation(cr, inputrec, box, x, t, step, bNS);
1675         wallcycle_stop(wcycle, ewcROT);
1676     }
1677
1678     /* Temporary solution until all routines take PaddedRVecVector */
1679     rvec *f = as_rvec_array(force->data());
1680
1681     /* Start the force cycle counter.
1682      * Note that a different counter is used for dynamic load balancing.
1683      */
1684     wallcycle_start(wcycle, ewcFORCE);
1685
1686     if (bDoForces)
1687     {
1688         /* Reset forces for which the virial is calculated separately:
1689          * PME/Ewald forces if necessary */
1690         if (fr->bF_NoVirSum)
1691         {
1692             if (flags & GMX_FORCE_VIRIAL)
1693             {
1694                 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1695                 /* TODO: remove this "- 1" when padding is properly implemented */
1696                 clear_rvecs(fr->f_novirsum->size() - 1,
1697                             as_rvec_array(fr->f_novirsum->data()));
1698             }
1699             else
1700             {
1701                 /* We are not calculating the pressure so we do not need
1702                  * a separate array for forces that do not contribute
1703                  * to the pressure.
1704                  */
1705                 fr->f_novirsum = force;
1706             }
1707         }
1708
1709         /* Clear the short- and long-range forces */
1710         clear_rvecs(fr->natoms_force_constr, f);
1711
1712         clear_rvec(fr->vir_diag_posres);
1713     }
1714     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1715     {
1716         clear_pull_forces(inputrec->pull_work);
1717     }
1718
1719     /* update QMMMrec, if necessary */
1720     if (fr->bQMMM)
1721     {
1722         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1723     }
1724
1725     /* Compute the bonded and non-bonded energies and optionally forces */
1726     do_force_lowlevel(fr, inputrec, &(top->idef),
1727                       cr, nrnb, wcycle, mdatoms,
1728                       x, hist, f, enerd, fcd, top, fr->born,
1729                       bBornRadii, box,
1730                       inputrec->fepvals, lambda,
1731                       graph, &(top->excls), fr->mu_tot,
1732                       flags,
1733                       &cycles_pme);
1734
1735     wallcycle_stop(wcycle, ewcFORCE);
1736
1737     if (DOMAINDECOMP(cr))
1738     {
1739         dd_force_flop_stop(cr->dd, nrnb);
1740
1741         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1742         {
1743             ddCloseBalanceRegionCpu(cr->dd);
1744         }
1745     }
1746
1747     if (ed)
1748     {
1749         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1750     }
1751
1752     if (bDoForces)
1753     {
1754         /* Collect forces from modules */
1755         gmx::ArrayRef<gmx::RVec> fNoVirSum;
1756         if (fr->bF_NoVirSum)
1757         {
1758             fNoVirSum = *fr->f_novirsum;
1759         }
1760         fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1761
1762         /* Communicate the forces */
1763         if (DOMAINDECOMP(cr))
1764         {
1765             wallcycle_start(wcycle, ewcMOVEF);
1766             dd_move_f(cr->dd, f, fr->fshift);
1767             /* Do we need to communicate the separate force array
1768              * for terms that do not contribute to the single sum virial?
1769              * Position restraints and electric fields do not introduce
1770              * inter-cg forces, only full electrostatics methods do.
1771              * When we do not calculate the virial, fr->f_novirsum = f,
1772              * so we have already communicated these forces.
1773              */
1774             if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1775                 (flags & GMX_FORCE_VIRIAL))
1776             {
1777                 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1778             }
1779             wallcycle_stop(wcycle, ewcMOVEF);
1780         }
1781
1782         /* If we have NoVirSum forces, but we do not calculate the virial,
1783          * we sum fr->f_novirsum=f later.
1784          */
1785         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1786         {
1787             wallcycle_start(wcycle, ewcVSITESPREAD);
1788             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1789                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1790             wallcycle_stop(wcycle, ewcVSITESPREAD);
1791         }
1792
1793         if (flags & GMX_FORCE_VIRIAL)
1794         {
1795             /* Calculation of the virial must be done after vsites! */
1796             calc_virial(0, mdatoms->homenr, x, f,
1797                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1798         }
1799     }
1800
1801     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1802     {
1803         pull_potential_wrapper(cr, inputrec, box, x,
1804                                f, vir_force, mdatoms, enerd, lambda, t,
1805                                wcycle);
1806     }
1807
1808     /* Add the forces from enforced rotation potentials (if any) */
1809     if (inputrec->bRot)
1810     {
1811         wallcycle_start(wcycle, ewcROTadd);
1812         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1813         wallcycle_stop(wcycle, ewcROTadd);
1814     }
1815
1816     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1817     IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1818
1819     if (PAR(cr) && !(cr->duty & DUTY_PME))
1820     {
1821         /* In case of node-splitting, the PP nodes receive the long-range
1822          * forces, virial and energy from the PME nodes here.
1823          */
1824         pme_receive_force_ener(cr, wcycle, enerd, fr);
1825     }
1826
1827     if (bDoForces)
1828     {
1829         post_process_forces(cr, step, nrnb, wcycle,
1830                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1831                             flags);
1832     }
1833
1834     if (flags & GMX_FORCE_ENERGY)
1835     {
1836         /* Sum the potential energy terms from group contributions */
1837         sum_epot(&(enerd->grpp), enerd->term);
1838
1839         if (!EI_TPI(inputrec->eI))
1840         {
1841             checkPotentialEnergyValidity(enerd);
1842         }
1843     }
1844
1845 }
1846
1847 void do_force(FILE *fplog, t_commrec *cr,
1848               t_inputrec *inputrec,
1849               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1850               gmx_localtop_t *top,
1851               gmx_groups_t *groups,
1852               matrix box, PaddedRVecVector *coordinates, history_t *hist,
1853               PaddedRVecVector *force,
1854               tensor vir_force,
1855               t_mdatoms *mdatoms,
1856               gmx_enerdata_t *enerd, t_fcdata *fcd,
1857               gmx::ArrayRef<real> lambda, t_graph *graph,
1858               t_forcerec *fr,
1859               gmx_vsite_t *vsite, rvec mu_tot,
1860               double t, gmx_edsam_t ed,
1861               gmx_bool bBornRadii,
1862               int flags,
1863               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1864               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1865 {
1866     /* modify force flag if not doing nonbonded */
1867     if (!fr->bNonbonded)
1868     {
1869         flags &= ~GMX_FORCE_NONBONDED;
1870     }
1871
1872     GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1873                fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1874     GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1875                fr->natoms_force == 0, "We might need 1 element extra for SIMD");
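    /* The "+ 1" mirrors the padding rationale in do_constrain_first() below:
     * (unaligned) 4-wide SIMD loads of rvec entries may read one element
     * past the last atom.
     */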
1876
1877     rvec *x = as_rvec_array(coordinates->data());
1878
1879     switch (inputrec->cutoff_scheme)
1880     {
1881         case ecutsVERLET:
1882             do_force_cutsVERLET(fplog, cr, inputrec,
1883                                 step, nrnb, wcycle,
1884                                 top,
1885                                 groups,
1886                                 box, x, hist,
1887                                 force, vir_force,
1888                                 mdatoms,
1889                                 enerd, fcd,
1890                                 lambda.data(), graph,
1891                                 fr, fr->ic,
1892                                 vsite, mu_tot,
1893                                 t, ed,
1894                                 bBornRadii,
1895                                 flags,
1896                                 ddOpenBalanceRegion,
1897                                 ddCloseBalanceRegion);
1898             break;
1899         case ecutsGROUP:
1900             do_force_cutsGROUP(fplog, cr, inputrec,
1901                                step, nrnb, wcycle,
1902                                top,
1903                                groups,
1904                                box, x, hist,
1905                                force, vir_force,
1906                                mdatoms,
1907                                enerd, fcd,
1908                                lambda.data(), graph,
1909                                fr, vsite, mu_tot,
1910                                t, ed,
1911                                bBornRadii,
1912                                flags,
1913                                ddOpenBalanceRegion,
1914                                ddCloseBalanceRegion);
1915             break;
1916         default:
1917             gmx_incons("Invalid cut-off scheme passed!");
1918     }
1919
1920     /* In case we don't have constraints and are using GPUs, the next balancing
1921      * region starts here.
1922      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1923      * virial calculation and COM pulling, is thus not included in
1924      * the balance timing, which is OK as most of these tasks do communication.
1925      */
1926     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1927     {
1928         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1929     }
1930 }
1931
1932
1933 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1934                         t_inputrec *ir, t_mdatoms *md,
1935                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1936                         t_forcerec *fr, gmx_localtop_t *top)
1937 {
1938     int             i, m, start, end;
1939     gmx_int64_t     step;
1940     real            dt = ir->delta_t;
1941     real            dvdl_dum;
1942     rvec           *savex;
1943
1944     /* We need to allocate one element extra, since we might use
1945      * (unaligned) 4-wide SIMD loads to access rvec entries.
1946      */
1947     snew(savex, state->natoms + 1);
1948
1949     start = 0;
1950     end   = md->homenr;
1951
1952     if (debug)
1953     {
1954         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1955                 start, md->homenr, end);
1956     }
1957     /* Do a first constraint pass to reset the particles... */
1958     step = ir->init_step;
1959     if (fplog)
1960     {
1961         char buf[STEPSTRSIZE];
1962         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1963                 gmx_step_str(step, buf));
1964     }
1965     dvdl_dum = 0;
1966
1967     /* constrain the current position */
1968     constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1969               ir, cr, step, 0, 1.0, md,
1970               as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1971               fr->bMolPBC, state->box,
1972               state->lambda[efptBONDED], &dvdl_dum,
1973               nullptr, nullptr, nrnb, econqCoord);
1974     if (EI_VV(ir->eI))
1975     {
1976         /* constrain the initial velocity, and save it */
1977         /* this may also be useful if we need the ekin from the half step for velocity Verlet */
1978         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1979                   ir, cr, step, 0, 1.0, md,
1980                   as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1981                   fr->bMolPBC, state->box,
1982                   state->lambda[efptBONDED], &dvdl_dum,
1983                   nullptr, nullptr, nrnb, econqVeloc);
1984     }
1985     /* constrain the initial velocities at t-dt/2 */
1986     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1987     {
1988         for (i = start; (i < end); i++)
1989         {
1990             for (m = 0; (m < DIM); m++)
1991             {
1992                 /* Reverse the velocity */
1993                 state->v[i][m] = -state->v[i][m];
1994                 /* Store the position at t-dt in savex */
1995                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1996             }
1997         }
1998         /* Shake the positions at t=-dt with the positions at t=0
1999          * as reference coordinates.
2000          */
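        /* At this point savex holds x(0) - dt*v(0), i.e. a backward Euler
         * step (v was reversed above); passing state->v to constrain()
         * below also applies the constraint corrections to the reversed
         * velocities.
         */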
2001         if (fplog)
2002         {
2003             char buf[STEPSTRSIZE];
2004             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2005                     gmx_step_str(step, buf));
2006         }
2007         dvdl_dum = 0;
2008         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2009                   ir, cr, step, -1, 1.0, md,
2010                   as_rvec_array(state->x.data()), savex, nullptr,
2011                   fr->bMolPBC, state->box,
2012                   state->lambda[efptBONDED], &dvdl_dum,
2013                   as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2014
2015         for (i = start; i < end; i++)
2016         {
2017             for (m = 0; m < DIM; m++)
2018             {
2019                 /* Re-reverse the velocities */
2020                 state->v[i][m] = -state->v[i][m];
2021             }
2022         }
2023     }
2024     sfree(savex);
2025 }
2026
2027
2028 static void
2029 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2030                 double *enerout, double *virout)
2031 {
2032     double enersum, virsum;
2033     double invscale, invscale2, invscale3;
2034     double r, ea, eb, ec, pa, pb, pc, pd;
2035     double y0, f, g, h;
2036     int    ri, offset;
2037     double tabfactor;
2038
2039     invscale  = 1.0/scale;
2040     invscale2 = invscale*invscale;
2041     invscale3 = invscale*invscale2;
2042
2043     /* Following summation derived from cubic spline definition,
2044      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2045      * the cubic spline.  We first calculate the negative of the
2046      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2047      * add the more standard, abrupt cutoff correction to that result,
2048      * yielding the long-range correction for a switched function.  We
2049      * perform both the pressure and energy loops at the same time for
2050      * simplicity, as the computational cost is low. */
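    /* For reference: bin ri of the table stores a cubic spline
     * V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with eps in [0,1] and
     * r = (ri + eps)*invscale. Substituting gives
     * r^2 dr = (ec + eb*eps + ea*eps^2) deps for the energy integrand and
     * r^3 dV = (pd + pc*eps + pb*eps^2 + pa*eps^3) dV/deps deps for the
     * virial, which integrate to the 1/2 ... 1/6 factors used below.
     */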
2051
2052     if (offstart == 0)
2053     {
2054         /* Since the dispersion table has been scaled down a factor
2055          * 6.0 and the repulsion a factor 12.0 to compensate for the
2056          * c6/c12 parameters inside nbfp[] being scaled up (to save
2057          * flops in kernels), we need to correct for this.
2058          */
2059         tabfactor = 6.0;
2060     }
2061     else
2062     {
2063         tabfactor = 12.0;
2064     }
2065
2066     enersum = 0.0;
2067     virsum  = 0.0;
2068     for (ri = rstart; ri < rend; ++ri)
2069     {
2070         r  = ri*invscale;
2071         ea = invscale3;
2072         eb = 2.0*invscale2*r;
2073         ec = invscale*r*r;
2074
2075         pa = invscale3;
2076         pb = 3.0*invscale2*r;
2077         pc = 3.0*invscale*r*r;
2078         pd = r*r*r;
2079
2080         /* This "8" is the packing stride of the vdwtab array: 4 spline
2081            coefficients each for dispersion and repulsion - should perhaps be a named constant */
2082
2083         offset = 8*ri + offstart;
2084         y0     = vdwtab[offset];
2085         f      = vdwtab[offset+1];
2086         g      = vdwtab[offset+2];
2087         h      = vdwtab[offset+3];
2088
2089         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2090         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2091     }
2092     *enerout = 4.0*M_PI*enersum*tabfactor;
2093     *virout  = 4.0*M_PI*virsum*tabfactor;
2094 }
2095
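/* Note on usage (see calc_enervirdiff() below): in the packed table each
 * bin holds 8 reals; offstart = 0 integrates the dispersion columns
 * (tabfactor 6) and offstart = 4 the repulsion columns (tabfactor 12).
 */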
2096 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2097 {
2098     double   eners[2], virs[2], enersum, virsum;
2099     double   r0, rc3, rc9;
2100     int      ri0, ri1, i;
2101     real     scale, *vdwtab;
2102
2103     fr->enershiftsix    = 0;
2104     fr->enershifttwelve = 0;
2105     fr->enerdiffsix     = 0;
2106     fr->enerdifftwelve  = 0;
2107     fr->virdiffsix      = 0;
2108     fr->virdifftwelve   = 0;
2109
2110     if (eDispCorr != edispcNO)
2111     {
2112         for (i = 0; i < 2; i++)
2113         {
2114             eners[i] = 0;
2115             virs[i]  = 0;
2116         }
2117         if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2118             (fr->vdw_modifier == eintmodPOTSWITCH) ||
2119             (fr->vdw_modifier == eintmodFORCESWITCH) ||
2120             (fr->vdwtype == evdwSHIFT) ||
2121             (fr->vdwtype == evdwSWITCH))
2122         {
2123             if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2124                  (fr->vdw_modifier == eintmodFORCESWITCH) ||
2125                  (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2126             {
2127                 gmx_fatal(FARGS,
2128                           "With dispersion correction rvdw-switch can not be zero "
2129                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2130             }
2131
2132             /* TODO This code depends on the logic in tables.c that
2133                constructs the table layout, which should be made
2134                explicit in future cleanup. */
2135             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2136                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2137             scale  = fr->dispersionCorrectionTable->scale;
2138             vdwtab = fr->dispersionCorrectionTable->data;
2139
2140             /* Round the cut-offs to exact table values for precision */
2141             ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
2142             ri1  = static_cast<int>(ceil(fr->rvdw*scale));
2143
2144             /* The code below has some support for handling force-switching, i.e.
2145              * when the force (instead of potential) is switched over a limited
2146              * region. This leads to a constant shift in the potential inside the
2147              * switching region, which we can handle by adding a constant energy
2148              * term in the force-switch case just like when we do potential-shift.
2149              *
2150              * For now this is not enabled, but to keep the functionality in the
2151              * code we check separately for switch and shift. When we do force-switch
2152              * the shifting point is rvdw_switch, while it is the cutoff when we
2153              * have a classical potential-shift.
2154              *
2155              * For a pure potential-shift the potential has a constant shift
2156              * all the way out to the cutoff, and that is it. For other forms
2157              * we need to calculate the constant shift up to the point where we
2158              * start modifying the potential.
2159              */
2160             ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2161
2162             r0   = ri0/scale;
2163             rc3  = r0*r0*r0;
2164             rc9  = rc3*rc3*rc3;
2165
2166             if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2167                 (fr->vdwtype == evdwSHIFT))
2168             {
2169                 /* Determine the constant energy shift below rvdw_switch.
2170                  * Table has a scale factor since we have scaled it down to compensate
2171                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2172                  */
2173                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2174                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2175             }
2176             else if (fr->vdw_modifier == eintmodPOTSHIFT)
2177             {
2178                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2179                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2180             }
2181
2182             /* Add the constant part from 0 to rvdw_switch.
2183              * This integration from 0 to rvdw_switch overcounts the number
2184              * of interactions by 1, as it also counts the self interaction.
2185              * We will correct for this later.
2186              */
2187             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2188             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2189
2190             /* Calculate the contribution in the range [r0,r1] where we
2191              * modify the potential. For a pure potential-shift modifier we will
2192              * have ri0==ri1, and there will not be any contribution here.
2193              */
2194             for (i = 0; i < 2; i++)
2195             {
2196                 enersum = 0;
2197                 virsum  = 0;
2198                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2199                 eners[i] -= enersum;
2200                 virs[i]  -= virsum;
2201             }
2202
2203             /* Alright: Above we compensated by REMOVING the parts outside r0
2204              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2205              *
2206              * Regardless of whether r0 is the point where we start switching,
2207              * or the cutoff where we calculated the constant shift, we include
2208              * all the parts we are missing out to infinity from r0 by
2209              * calculating the analytical dispersion correction.
2210              */
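            /* E.g. for the dispersion term V(r) = -1/r^6 the tail energy is
             * the integral from r0 to infinity of 4*pi*r^2*V(r) dr
             * = -4*pi/(3*r0^3), and the integral of 4*pi*r^2 * r*dV/dr
             * gives 8*pi/r0^3; the 1/r^12 term yields the two remaining
             * constants.
             */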
2211             eners[0] += -4.0*M_PI/(3.0*rc3);
2212             eners[1] +=  4.0*M_PI/(9.0*rc9);
2213             virs[0]  +=  8.0*M_PI/rc3;
2214             virs[1]  += -16.0*M_PI/(3.0*rc9);
2215         }
2216         else if (fr->vdwtype == evdwCUT ||
2217                  EVDW_PME(fr->vdwtype) ||
2218                  fr->vdwtype == evdwUSER)
2219         {
2220             if (fr->vdwtype == evdwUSER && fplog)
2221             {
2222                 fprintf(fplog,
2223                         "WARNING: using dispersion correction with user tables\n");
2224             }
2225
2226             /* Note that with LJ-PME, the dispersion correction is multiplied
2227              * by the difference between the actual C6 and the value of C6
2228              * that would produce the combination rule.
2229              * This means the normal energy and virial difference formulas
2230              * can be used here.
2231              */
2232
2233             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2234             rc9  = rc3*rc3*rc3;
2235             /* Contribution beyond the cut-off */
2236             eners[0] += -4.0*M_PI/(3.0*rc3);
2237             eners[1] +=  4.0*M_PI/(9.0*rc9);
2238             if (fr->vdw_modifier == eintmodPOTSHIFT)
2239             {
2240                 /* Contribution within the cut-off */
2241                 eners[0] += -4.0*M_PI/(3.0*rc3);
2242                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2243             }
2244             /* Contribution beyond the cut-off */
2245             virs[0]  +=  8.0*M_PI/rc3;
2246             virs[1]  += -16.0*M_PI/(3.0*rc9);
2247         }
2248         else
2249         {
2250             gmx_fatal(FARGS,
2251                       "Dispersion correction is not implemented for vdw-type = %s",
2252                       evdw_names[fr->vdwtype]);
2253         }
2254
2255         /* When we deprecate the group kernels the code below can go too */
2256         if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2257         {
2258             /* Calculate self-interaction coefficient (assuming that
2259              * the reciprocal-space contribution is constant in the
2260              * region that contributes to the self-interaction).
2261              */
2262             fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2263
2264             eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2265             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2266         }
2267
2268         fr->enerdiffsix    = eners[0];
2269         fr->enerdifftwelve = eners[1];
2270         /* The 0.5 is due to the Gromacs definition of the virial */
2271         fr->virdiffsix     = 0.5*virs[0];
2272         fr->virdifftwelve  = 0.5*virs[1];
2273     }
2274 }
2275
2276 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2277                    matrix box, real lambda, tensor pres, tensor virial,
2278                    real *prescorr, real *enercorr, real *dvdlcorr)
2279 {
2280     gmx_bool bCorrAll, bCorrPres;
2281     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2282     int      m;
2283
2284     *prescorr = 0;
2285     *enercorr = 0;
2286     *dvdlcorr = 0;
2287
2288     clear_mat(virial);
2289     clear_mat(pres);
2290
2291     if (ir->eDispCorr != edispcNO)
2292     {
2293         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2294                      ir->eDispCorr == edispcAllEnerPres);
2295         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2296                      ir->eDispCorr == edispcAllEnerPres);
2297
2298         invvol = 1/det(box);
2299         if (fr->n_tpi)
2300         {
2301             /* Only correct for the interactions with the inserted molecule */
2302             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2303             ninter = fr->n_tpi;
2304         }
2305         else
2306         {
2307             dens   = fr->numAtomsForDispersionCorrection*invvol;
2308             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2309         }
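        /* E.g. without TPI and with N = numAtomsForDispersionCorrection,
         * ninter*dens = N^2/(2V): the pair density that the homogeneous
         * correction integrals below are multiplied by.
         */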
2310
2311         if (ir->efep == efepNO)
2312         {
2313             avcsix    = fr->avcsix[0];
2314             avctwelve = fr->avctwelve[0];
2315         }
2316         else
2317         {
2318             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2319             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2320         }
2321
2322         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2323         *enercorr += avcsix*enerdiff;
2324         dvdlambda  = 0.0;
2325         if (ir->efep != efepNO)
2326         {
2327             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2328         }
2329         if (bCorrAll)
2330         {
2331             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2332             *enercorr += avctwelve*enerdiff;
2333             if (fr->efep != efepNO)
2334             {
2335                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2336             }
2337         }
2338
2339         if (bCorrPres)
2340         {
2341             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2342             if (ir->eDispCorr == edispcAllEnerPres)
2343             {
2344                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2345             }
2346             /* The factor 2 is because of the Gromacs virial definition */
2347             spres = -2.0*invvol*svir*PRESFAC;
2348
2349             for (m = 0; m < DIM; m++)
2350             {
2351                 virial[m][m] += svir;
2352                 pres[m][m]   += spres;
2353             }
2354             *prescorr += spres;
2355         }
2356
2357         /* We can't currently control when this prints; for now, just print when debugging */
2358         if (debug)
2359         {
2360             if (bCorrAll)
2361             {
2362                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2363                         avcsix, avctwelve);
2364             }
2365             if (bCorrPres)
2366             {
2367                 fprintf(debug,
2368                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2369                         *enercorr, spres, svir);
2370             }
2371             else
2372             {
2373                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2374             }
2375         }
2376
2377         if (fr->efep != efepNO)
2378         {
2379             *dvdlcorr += dvdlambda;
2380         }
2381     }
2382 }
2383
2384 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2385                             const gmx_mtop_t *mtop, rvec x[],
2386                             gmx_bool bFirst)
2387 {
2388     t_graph        *graph;
2389     int             mb, as, mol;
2390     gmx_molblock_t *molb;
2391
2392     if (bFirst && fplog)
2393     {
2394         fprintf(fplog, "Removing pbc first time\n");
2395     }
2396
2397     snew(graph, 1);
2398     as = 0;
2399     for (mb = 0; mb < mtop->nmolblock; mb++)
2400     {
2401         molb = &mtop->molblock[mb];
2402         if (molb->natoms_mol == 1 ||
2403             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2404         {
2405             /* Just one atom or charge group in the molecule, no PBC required */
2406             as += molb->nmol*molb->natoms_mol;
2407         }
2408         else
2409         {
2410             /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2411             mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2412                            0, molb->natoms_mol, FALSE, FALSE, graph);
2413
2414             for (mol = 0; mol < molb->nmol; mol++)
2415             {
2416                 mk_mshift(fplog, graph, ePBC, box, x+as);
2417
2418                 shift_self(graph, box, x+as);
2419                 /* The molecule is whole now.
2420                  * We don't need the second mk_mshift call as in do_pbc_first,
2421                  * since we no longer need this graph.
2422                  */
2423
2424                 as += molb->natoms_mol;
2425             }
2426             done_graph(graph);
2427         }
2428     }
2429     sfree(graph);
2430 }
2431
2432 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2433                        const gmx_mtop_t *mtop, rvec x[])
2434 {
2435     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2436 }
2437
2438 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2439                  gmx_mtop_t *mtop, rvec x[])
2440 {
2441     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2442 }
2443
2444 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2445 {
2446     int t, nth;
2447     nth = gmx_omp_nthreads_get(emntDefault);
2448
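    /* Each thread handles a contiguous, near-equal chunk of the atom range,
     * e.g. natoms = 10 with nth = 4 gives chunk lengths 2, 3, 2, 3.
     */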
2449 #pragma omp parallel for num_threads(nth) schedule(static)
2450     for (t = 0; t < nth; t++)
2451     {
2452         try
2453         {
2454             int offset, len;
2455
2456             offset = (natoms*t    )/nth;
2457             len    = (natoms*(t + 1))/nth - offset;
2458             put_atoms_in_box(ePBC, box, len, x + offset);
2459         }
2460         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2461     }
2462 }
2463
2464 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2465 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2466                 const t_inputrec *inputrec,
2467                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2468                 gmx_walltime_accounting_t walltime_accounting,
2469                 nonbonded_verlet_t *nbv,
2470                 gmx_bool bWriteStat)
2471 {
2472     t_nrnb *nrnb_tot = nullptr;
2473     double  delta_t  = 0;
2474     double  nbfs     = 0, mflop = 0;
2475     double  elapsed_time,
2476             elapsed_time_over_all_ranks,
2477             elapsed_time_over_all_threads,
2478             elapsed_time_over_all_threads_over_all_ranks;
2479     /* Control whether it is valid to print a report. Only the
2480        simulation master may print, but it should not do so if the run
2481        terminated e.g. before a scheduled reset step. This is
2482        complicated by the fact that PME ranks are unaware of the
2483        reason why they were sent a pmerecvqxFINISH. To avoid
2484        communication deadlocks, we always do the communication for the
2485        report, even if we've decided not to write the report, because
2486        how long it takes to finish the run is not important when we've
2487        decided not to report on the simulation performance. */
2488     bool    printReport = SIMMASTER(cr);
2489
2490     if (!walltime_accounting_get_valid_finish(walltime_accounting))
2491     {
2492         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2493         printReport = false;
2494     }
2495
2496     if (cr->nnodes > 1)
2497     {
2498         snew(nrnb_tot, 1);
2499 #if GMX_MPI
2500         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2501                       cr->mpi_comm_mysim);
2502 #endif
2503     }
2504     else
2505     {
2506         nrnb_tot = nrnb;
2507     }
2508
2509     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2510     elapsed_time_over_all_ranks                  = elapsed_time;
2511     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2512     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2513 #if GMX_MPI
2514     if (cr->nnodes > 1)
2515     {
2516         /* reduce elapsed_time over all MPI ranks in the current simulation */
2517         MPI_Allreduce(&elapsed_time,
2518                       &elapsed_time_over_all_ranks,
2519                       1, MPI_DOUBLE, MPI_SUM,
2520                       cr->mpi_comm_mysim);
2521         elapsed_time_over_all_ranks /= cr->nnodes;
2522         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2523          * current simulation. */
2524         MPI_Allreduce(&elapsed_time_over_all_threads,
2525                       &elapsed_time_over_all_threads_over_all_ranks,
2526                       1, MPI_DOUBLE, MPI_SUM,
2527                       cr->mpi_comm_mysim);
2528     }
2529 #endif
2530
2531     if (printReport)
2532     {
2533         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2534     }
2535     if (cr->nnodes > 1)
2536     {
2537         sfree(nrnb_tot);
2538     }
2539
2540     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2541     {
2542         print_dd_statistics(cr, inputrec, fplog);
2543     }
2544
2545     /* TODO Move the responsibility for any scaling by thread counts
2546      * to the code that handled the thread region, so that there's a
2547      * mechanism to keep cycle counting working during the transition
2548      * to task parallelism. */
2549     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2550     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2551     wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2552     auto cycle_sum(wallcycle_sum(cr, wcycle));
2553
2554     if (printReport)
2555     {
2556         struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2557
2558         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2559                         elapsed_time_over_all_ranks,
2560                         wcycle, cycle_sum, gputimes);
2561
2562         if (EI_DYNAMICS(inputrec->eI))
2563         {
2564             delta_t = inputrec->delta_t;
2565         }
2566
2567         if (fplog)
2568         {
2569             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2570                        elapsed_time_over_all_ranks,
2571                        walltime_accounting_get_nsteps_done(walltime_accounting),
2572                        delta_t, nbfs, mflop);
2573         }
2574         if (bWriteStat)
2575         {
2576             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2577                        elapsed_time_over_all_ranks,
2578                        walltime_accounting_get_nsteps_done(walltime_accounting),
2579                        delta_t, nbfs, mflop);
2580         }
2581     }
2582 }
2583
2584 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2585 {
2586     /* this function works, but could probably use a logic rewrite to keep all the different
2587        types of efep straight. */
2588
2589     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2590     {
2591         return;
2592     }
2593
2594     t_lambda *fep = ir->fepvals;
2595     *fep_state    = fep->init_fep_state; /* this might overwrite the value read from
2596                                             a checkpoint -- a kludge is in place for now
2597                                             to prevent this. */
2598
2599     for (int i = 0; i < efptNR; i++)
2600     {
2601         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2602         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2603         {
2604             lambda[i] = fep->init_lambda;
2605             if (lam0)
2606             {
2607                 lam0[i] = lambda[i];
2608             }
2609         }
2610         else
2611         {
2612             lambda[i] = fep->all_lambda[i][*fep_state];
2613             if (lam0)
2614             {
2615                 lam0[i] = lambda[i];
2616             }
2617         }
2618     }
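    /* E.g. a scalar init-lambda = 0.5 sets every component to 0.5, whereas
     * with lambda vectors component i tracks fep->all_lambda[i][*fep_state].
     */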
2619     if (ir->bSimTemp)
2620     {
2621         /* need to rescale control temperatures to match current state */
2622         for (int i = 0; i < ir->opts.ngtc; i++)
2623         {
2624             if (ir->opts.ref_t[i] > 0)
2625             {
2626                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2627             }
2628         }
2629     }
2630
2631     /* Report the current lambda components in the log */
2632     if (fplog != nullptr)
2633     {
2634         fprintf(fplog, "Initial vector of lambda components: [ ");
2635         for (const auto &l : lambda)
2636         {
2637             fprintf(fplog, "%10.4f ", l);
2638         }
2639         fprintf(fplog, "]\n");
2640     }
2641     return;
2642 }
2643
2644
2645 void init_md(FILE *fplog,
2646              t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2647              t_inputrec *ir, const gmx_output_env_t *oenv,
2648              double *t, double *t0,
2649              gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2650              t_nrnb *nrnb, gmx_mtop_t *mtop,
2651              gmx_update_t **upd,
2652              int nfile, const t_filenm fnm[],
2653              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2654              tensor force_vir, tensor shake_vir, rvec mu_tot,
2655              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2656              gmx_wallcycle_t wcycle)
2657 {
2658     int  i;
2659
2660     /* Initial values */
2661     *t = *t0       = ir->init_t;
2662
2663     *bSimAnn = FALSE;
2664     for (i = 0; i < ir->opts.ngtc; i++)
2665     {
2666         /* set bSimAnn if any group is being annealed */
2667         if (ir->opts.annealing[i] != eannNO)
2668         {
2669             *bSimAnn = TRUE;
2670         }
2671     }
2672
2673     /* Initialize lambda variables */
2674     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2675
2676     // TODO upd is never NULL in practice, but the analysers don't know that
2677     if (upd)
2678     {
2679         *upd = init_update(ir);
2680     }
2681     if (*bSimAnn)
2682     {
2683         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2684     }
2685
2686     if (vcm != nullptr)
2687     {
2688         *vcm = init_vcm(fplog, &mtop->groups, ir);
2689     }
2690
2691     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2692     {
2693         if (ir->etc == etcBERENDSEN)
2694         {
2695             please_cite(fplog, "Berendsen84a");
2696         }
2697         if (ir->etc == etcVRESCALE)
2698         {
2699             please_cite(fplog, "Bussi2007a");
2700         }
2701         if (ir->eI == eiSD1)
2702         {
2703             please_cite(fplog, "Goga2012");
2704         }
2705     }
2706     init_nrnb(nrnb);
2707
2708     if (nfile != -1)
2709     {
2710         *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2711
2712         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2713                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2714     }
2715
2716     /* Initialize variables */
2717     clear_mat(force_vir);
2718     clear_mat(shake_vir);
2719     clear_rvec(mu_tot);
2720 }