/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "sim_util.h"

#include "config.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <cmath>
#include <cstdint>

#include <array>

#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"

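/*! \brief Prints the current step number and an estimate of the remaining
 * wall-clock time (or the measured performance) to \p out.
 */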
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#if !GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    fflush(out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                /* Strip the trailing newline that gmx_ctime_r adds */
                timebuf[strlen(timebuf)-1] = '\0';
                fprintf(out, ", will finish %s", timebuf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
            /* delta_t is in ps, so delta_t/1000 is the ns simulated per step;
             * multiplied by 24*60*60 seconds per day and divided by the wall
             * seconds per step, this gives ns/day.
             */
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#if !GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}

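/*! \brief Prints \p title, the rank id and the time \p the_time
 * (seconds since the epoch) to \p fplog.
 */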
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char   time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}

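/*! \brief Prints a "Started <name>" line with the run start time to the log file */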
void print_start(FILE *fplog, t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}

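/*! \brief Adds the forces in \p forceToAdd to \p f, parallelized over atoms with OpenMP */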
static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
{
    /* TODO: remove this - 1 when padding is properly implemented */
    int         end  = forceToAdd->size() - 1;
    const rvec *fAdd = as_rvec_array(forceToAdd->data());

    // cppcheck-suppress unreadVariable
    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], fAdd[i]);
    }
}

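/*! \brief Computes the short-range virial for the local atoms from the
 * shift forces and atomic forces, and adds the position-restraint and
 * wall contributions stored in \p fr.
 */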
static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i;

    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

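/*! \brief Computes the center-of-mass pulling potential and adds its
 * force, energy and dV/dlambda contributions.
 */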
static void pull_potential_wrapper(t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}

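/*! \brief With separate PME ranks, receives the long-range forces,
 * virial and energies from the PME ranks and accumulates them on this PP rank.
 */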
static void pme_receive_force_ener(t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e_q, e_lj, dvdl_q, dvdl_lj;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

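/*! \brief Prints position and force of local atoms for which the force
 * norm exceeds \p forceTolerance, and issues a fatal error when
 * non-finite forces are detected.
 */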
static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_int64_t step, real forceTolerance,
                               const rvec *x, const rvec *f)
{
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
    }
}

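/*! \brief Post-processes the forces: spreads mesh forces on virtual
 * sites, adds the no-virial-summation force buffer and the mesh virial
 * contributions, and optionally prints large forces.
 */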
static void post_process_forces(t_commrec *cr,
                                gmx_int64_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, fr->f_novirsum);

            if (EEL_FULL(fr->ic->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (EVDW_PME(fr->ic->vdwtype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_lj_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

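/*! \brief Runs (CPU) or launches (GPU) the non-bonded Verlet kernel for
 * the given interaction locality and accounts for the flops.
 */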
static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         gmx_int64_t step,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * every nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond the inner list cut-off (rlistInner)
             * using the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             nbv->nbat,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbv->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbv->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");

    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}

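/*! \brief Computes the perturbed non-bonded interactions for free-energy
 * calculations on the CPU, including foreign-lambda energies when requested.
 */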
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             t_mdatoms            *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
    }

    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* When we are doing foreign-lambda evaluations and have soft-core
     * interactions, we have to recalculate the (non-linear) energy
     * contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}

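/*! \brief Returns whether the Verlet-scheme non-bonded module uses a GPU */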
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}

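/*! \brief Zeroes \p n rvecs, using OpenMP only when the task is large enough to benefit */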
static gmx_inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}

/*! \brief  This routine checks if the potential energy is finite.
 *
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] enerd  The energy data; the non-bonded group energies need to be added in here before calling this routine
 */
static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
{
    if (!std::isfinite(enerd->term[F_EPOT]))
    {
        gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
                  enerd->term[F_EPOT],
                  enerd->term[F_LJ],
                  enerd->term[F_COUL_SR]);
    }
}

/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     cr              The communication record
 * \param[in]     inputrec        The input record
 * \param[in]     step            The current MD step
 * \param[in]     t               The current time
 * \param[in,out] wcycle          Wallcycle accounting struct
 * \param[in,out] forceProviders  Pointer to a list of force providers
 * \param[in]     box             The unit cell
 * \param[in]     x               The coordinates
 * \param[in]     mdatoms         Per atom properties
 * \param[in]     lambda          Array of free-energy lambda values
 * \param[in]     forceFlags      Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] force           Force buffer; does not contribute to the virial
 * \param[in,out] virial          Virial buffer
 * \param[in,out] enerd           Energy buffer
 * \param[in,out] ed              Essential dynamics pointer
 * \param[in]     bNS             Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(t_commrec        *cr,
                     t_inputrec       *inputrec,
                     gmx_int64_t       step,
                     double            t,
                     gmx_wallcycle_t   wcycle,
                     ForceProviders   *forceProviders,
                     matrix            box,
                     rvec             *x,
                     t_mdatoms        *mdatoms,
                     real             *lambda,
                     int               forceFlags,
                     PaddedRVecVector *force,
                     tensor            virial,
                     gmx_enerdata_t   *enerd,
                     gmx_edsam_t       ed,
                     gmx_bool          bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES);

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        /* Collect forces from modules */
        gmx::ArrayRef<gmx::RVec> f = *force;

        forceProviders->calculateForces(cr, mdatoms, box, t, x, f);
    }

    rvec *f = as_rvec_array(force->data());

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               f, virial, mdatoms, enerd, lambda, t,
                               wcycle);
    }

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}

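/*! \brief Computes forces and energies with the Verlet cut-off scheme,
 * overlapping GPU non-bonded kernels and halo communication with CPU
 * force computation where possible.
 */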
static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                                t_inputrec *inputrec,
                                gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                gmx_groups_t gmx_unused *groups,
                                matrix box, rvec x[], history_t *hist,
                                PaddedRVecVector *force,
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda, t_graph *graph,
                                t_forcerec *fr, interaction_const_t *ic,
                                gmx_vsite_t *vsite, rvec mu_tot,
                                double t, gmx_edsam_t ed,
                                gmx_bool bBornRadii,
                                int flags,
                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS &&
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
    }

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->nbat);

#if GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        wallcycle_start(wcycle, ewcPP_PMESENDX);
        gmx_pme_send_coordinates(cr, box, x,
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
                                 step);
        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, nullptr,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS)
        {
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
        }

        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
                            &top->excls,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
        {
            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
        }
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            ddOpenBalanceRegionGpu(cr->dd);
        }

        wallcycle_start(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        /* launch local nonbonded work on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
                                &top->excls,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);
            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
            {
                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
            }
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU);
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            /* launch non-local nonbonded tasks on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (DOMAINDECOMP(cr))
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                     flags, eatNonlocal);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                 flags, eatLocal);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force->data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    fr->f_novirsum = force;
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * separately, such as PME.
         */
        if (fr->bF_NoVirSum && (flags & GMX_FORCE_VIRIAL))
        {
            fr->f_novirsum = fr->forceBufferNoVirialSummation;

            /* TODO: remove this - 1 when padding is properly implemented */
            clear_rvecs_omp(fr->f_novirsum->size() - 1,
                            as_rvec_array(fr->f_novirsum->data()));
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication.  With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
        }

        /* In this branch the non-bonded kernels ran on the CPU, so the
         * shift-force reduction below always concerns the local group.
         */
        const int aloc = eintLocal;

        /* Add all the non-bonded forces to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, x, mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, nrnb, wcycle, mdatoms,
                      x, hist, f, enerd, fcd, top, fr->born,
                      bBornRadii, box,
                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);

    computeSpecialForces(cr, inputrec, step, t, wcycle,
                         fr->forceProviders, box, x, mdatoms, lambda,
                         flags, fr->f_novirsum, vir_force, enerd,
                         ed, bNS);

    if (bUseOrEmulGPU)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                       flags, eatNonlocal,
                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                       fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                               nbv->nbat, f);
            }
            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
        if (bDoForces)
        {
            wallcycle_start(wcycle, ewcMOVEF);
            dd_move_f(cr->dd, f, fr->fshift);
            wallcycle_stop(wcycle, ewcMOVEF);
        }
    }

    if (bUseOrEmulGPU)
    {
        /* wait for local forces (or calculate in emulation mode) */
        if (bUseGPU)
        {
            /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
             * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
             * but even with a step of 0.1 ms the difference is less than 1%
             * of the step time.
             */
            const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                   flags, eatLocal,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
            float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

            if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            {
                DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
                if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
                {
                    /* We measured few cycles, it could be that the kernel
                     * and transfer finished earlier and there was no actual
                     * wait time, only API call overhead.
                     * Then the actual time could be anywhere between 0 and
                     * cycles_wait_est. We will use half of cycles_wait_est.
                     */
                    waitedForGpu = DdBalanceRegionWaitedForGpu::no;
                }
                ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
            }

            /* now clear the GPU outputs while we finish the step on the CPU */
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);

            /* Is dynamic pair-list pruning activated? */
            if (nbv->listParams->useDynamicPruning)
            {
                /* We should not launch the rolling pruning kernel at a search
                 * step or just before search steps, since that's useless.
                 * Without domain decomposition we prune at even steps.
                 * With domain decomposition we alternate local and non-local
                 * pruning at even and odd steps.
                 */
                int  numRollingParts     = nbv->listParams->numRollingParts;
                GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
                int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
                bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
                if (stepWithCurrentList > 0 &&
                    stepWithCurrentList < inputrec->nstlist - 1 &&
                    (stepIsEven || DOMAINDECOMP(cr)))
                {
                    nbnxn_gpu_launch_kernel_pruneonly(fr->nbv->gpu_nbv,
                                                      stepIsEven ? eintLocal : eintNonlocal,
                                                      numRollingParts);
                }
            }
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
        }
        else
        {
            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);
        }
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                       nbv->nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

1427     if (DOMAINDECOMP(cr))
1428     {
1429         dd_force_flop_stop(cr->dd, nrnb);
1430     }
1431
1432     if (bDoForces)
1433     {
1434         /* If we have NoVirSum forces but do not calculate the virial,
1435          * fr->f_novirsum aliases f and is handled later, in post_process_forces.
1436          */
1437         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1438         {
1439             wallcycle_start(wcycle, ewcVSITESPREAD);
1440             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1441                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1442             wallcycle_stop(wcycle, ewcVSITESPREAD);
1443         }
1444
1445         if (flags & GMX_FORCE_VIRIAL)
1446         {
1447             /* Calculation of the virial must be done after vsites! */
1448             calc_virial(0, mdatoms->homenr, x, f,
1449                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1450         }
1451     }
1452
1453     if (PAR(cr) && !(cr->duty & DUTY_PME))
1454     {
1455         /* In case of node-splitting, the PP nodes receive the long-range
1456          * forces, virial and energy from the PME nodes here.
1457          */
1458         pme_receive_force_ener(cr, wcycle, enerd, fr);
1459     }
1460
1461     if (bDoForces)
1462     {
1463         post_process_forces(cr, step, nrnb, wcycle,
1464                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1465                             flags);
1466     }
1467
1468     if (flags & GMX_FORCE_ENERGY)
1469     {
1470         /* Sum the potential energy terms from group contributions */
1471         sum_epot(&(enerd->grpp), enerd->term);
1472
1473         if (!EI_TPI(inputrec->eI))
1474         {
1475             checkPotentialEnergyValidity(enerd);
1476         }
1477     }
1478 }
1479
1480 static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1481                                t_inputrec *inputrec,
1482                                gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1483                                gmx_localtop_t *top,
1484                                gmx_groups_t *groups,
1485                                matrix box, rvec x[], history_t *hist,
1486                                PaddedRVecVector *force,
1487                                tensor vir_force,
1488                                t_mdatoms *mdatoms,
1489                                gmx_enerdata_t *enerd, t_fcdata *fcd,
1490                                real *lambda, t_graph *graph,
1491                                t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1492                                double t, gmx_edsam_t ed,
1493                                gmx_bool bBornRadii,
1494                                int flags,
1495                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1496                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1497 {
1498     int        cg0, cg1, i, j;
1499     double     mu[2*DIM];
1500     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1501     gmx_bool   bDoForces;
1502     float      cycles_pme;
1503
1504     const int  start  = 0;
1505     const int  homenr = mdatoms->homenr;
1506
1507     clear_mat(vir_force);
1508
1509     cg0 = 0;
1510     if (DOMAINDECOMP(cr))
1511     {
1512         cg1 = cr->dd->ncg_tot;
1513     }
1514     else
1515     {
1516         cg1 = top->cgs.nr;
1517     }
1518     if (fr->n_tpi > 0)
1519     {
1520         cg1--;
1521     }
1522
1523     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1524     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1525     /* Do we need to fill the neighbor-search grid with the charge groups this step? */
1526     bFillGrid      = (bNS && bStateChanged);
1527     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1528     bDoForces      = (flags & GMX_FORCE_FORCES);
1529
1530     if (bStateChanged)
1531     {
1532         update_forcerec(fr, box);
1533
1534         if (inputrecNeedMutot(inputrec))
1535         {
1536             /* Calculate total (local) dipole moment in a temporary common array.
1537              * This makes it possible to sum them over nodes faster.
1538              */
1539             calc_mu(start, homenr,
1540                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1541                     mu, mu+DIM);
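                 /* calc_mu fills mu[0..DIM-1] with the state A dipole and
                  * mu[DIM..2*DIM-1] with the state B dipole, so both can be
                  * reduced with a single gmx_sumd(2*DIM, ...) call below.
                  */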
1542         }
1543     }
1544
1545     if (fr->ePBC != epbcNONE)
1546     {
1547         /* Recompute the shift vectors whenever the box may have changed,
1548          * i.e. with pressure coupling or box deformation.
1549          */
1550         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1551         {
1552             calc_shifts(box, fr->shift_vec);
1553         }
1554
1555         if (bCalcCGCM)
1556         {
1557             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1558                                      &(top->cgs), x, fr->cg_cm);
1559             inc_nrnb(nrnb, eNR_CGCM, homenr);
1560             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1561         }
1562         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1563         {
1564             unshift_self(graph, box, x);
1565         }
1566     }
1567     else if (bCalcCGCM)
1568     {
1569         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1570         inc_nrnb(nrnb, eNR_CGCM, homenr);
1571     }
1572
1573     if (bCalcCGCM && gmx_debug_at)
1574     {
1575         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1576     }
1577
1578 #if GMX_MPI
1579     if (!(cr->duty & DUTY_PME))
1580     {
1581         /* Send particle coordinates to the pme nodes.
1582          * Since this is only implemented for domain decomposition
1583          * and domain decomposition does not use the graph,
1584          * we do not need to worry about shifting.
1585          */
1586         wallcycle_start(wcycle, ewcPP_PMESENDX);
1587         gmx_pme_send_coordinates(cr, box, x,
1588                                  lambda[efptCOUL], lambda[efptVDW],
1589                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1590                                  step);
1591         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1592     }
1593 #endif /* GMX_MPI */
1594
1595     /* Communicate coordinates and sum dipole if necessary */
1596     if (DOMAINDECOMP(cr))
1597     {
1598         wallcycle_start(wcycle, ewcMOVEX);
1599         dd_move_x(cr->dd, box, x);
1600         wallcycle_stop(wcycle, ewcMOVEX);
1601         /* No GPU support, no move_x overlap, so reopen the balance region here */
1602         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1603         {
1604             ddReopenBalanceRegionCpu(cr->dd);
1605         }
1606     }
1607
1608     if (inputrecNeedMutot(inputrec))
1609     {
1610         if (bStateChanged)
1611         {
1612             if (PAR(cr))
1613             {
1614                 gmx_sumd(2*DIM, mu, cr);
1615                 ddReopenBalanceRegionCpu(cr->dd);
1616             }
1617             for (i = 0; i < 2; i++)
1618             {
1619                 for (j = 0; j < DIM; j++)
1620                 {
1621                     fr->mu_tot[i][j] = mu[i*DIM + j];
1622                 }
1623             }
1624         }
1625         if (fr->efep == efepNO)
1626         {
1627             copy_rvec(fr->mu_tot[0], mu_tot);
1628         }
1629         else
1630         {
1631             for (j = 0; j < DIM; j++)
1632             {
1633                 mu_tot[j] =
1634                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1635             }
1636         }
1637     }
1638
1639     /* Reset energies */
1640     reset_enerdata(enerd);
1641     clear_rvecs(SHIFTS, fr->fshift);
1642
1643     if (bNS)
1644     {
1645         wallcycle_start(wcycle, ewcNS);
1646
1647         if (graph && bStateChanged)
1648         {
1649             /* Calculate intramolecular shift vectors to make molecules whole */
1650             mk_mshift(fplog, graph, fr->ePBC, box, x);
1651         }
1652
1653         /* Do the actual neighbour searching */
1654         ns(fplog, fr, box,
1655            groups, top, mdatoms,
1656            cr, nrnb, bFillGrid);
1657
1658         wallcycle_stop(wcycle, ewcNS);
1659     }
1660
1661     if (inputrec->implicit_solvent && bNS)
1662     {
1663         make_gb_nblist(cr, inputrec->gb_algorithm,
1664                        x, box, fr, &top->idef, graph, fr->born);
1665     }
1666
1667     if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1668     {
1669         wallcycle_start(wcycle, ewcPPDURINGPME);
1670         dd_force_flop_start(cr->dd, nrnb);
1671     }
1672
1673     if (inputrec->bRot)
1674     {
1675         wallcycle_start(wcycle, ewcROT);
1676         do_rotation(cr, inputrec, box, x, t, step, bNS);
1677         wallcycle_stop(wcycle, ewcROT);
1678     }
1679
1680     /* Temporary solution until all routines take PaddedRVecVector */
1681     rvec *f = as_rvec_array(force->data());
1682
1683     /* Start the force cycle counter.
1684      * Note that a different counter is used for dynamic load balancing.
1685      */
1686     wallcycle_start(wcycle, ewcFORCE);
1687
1688     fr->f_novirsum = force;
1689     if (bDoForces)
1690     {
1691         /* If we need to compute the virial, we might need a separate
1692          * force buffer for algorithms for which the virial is calculated
1693          * separately, such as PME.
1694          */
1695         if (fr->bF_NoVirSum && (flags & GMX_FORCE_VIRIAL))
1696         {
1697             fr->f_novirsum = fr->forceBufferNoVirialSummation;
1698
1699             /* TODO: remove this - 1 when padding is properly implemented */
1700             clear_rvecs(fr->f_novirsum->size() - 1,
1701                         as_rvec_array(fr->f_novirsum->data()));
1702         }
1703
1704         /* Clear the short- and long-range forces */
1705         clear_rvecs(fr->natoms_force_constr, f);
1706
1707         clear_rvec(fr->vir_diag_posres);
1708     }
1709     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1710     {
1711         clear_pull_forces(inputrec->pull_work);
1712     }
1713
1714     /* update QMMMrec, if necessary */
1715     if (fr->bQMMM)
1716     {
1717         update_QMMMrec(cr, fr, x, mdatoms, box);
1718     }
1719
1720     /* Compute the bonded and non-bonded energies and optionally forces */
1721     do_force_lowlevel(fr, inputrec, &(top->idef),
1722                       cr, nrnb, wcycle, mdatoms,
1723                       x, hist, f, enerd, fcd, top, fr->born,
1724                       bBornRadii, box,
1725                       inputrec->fepvals, lambda,
1726                       graph, &(top->excls), fr->mu_tot,
1727                       flags,
1728                       &cycles_pme);
1729
1730     wallcycle_stop(wcycle, ewcFORCE);
1731
1732     if (DOMAINDECOMP(cr))
1733     {
1734         dd_force_flop_stop(cr->dd, nrnb);
1735
1736         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1737         {
1738             ddCloseBalanceRegionCpu(cr->dd);
1739         }
1740     }
1741
1742     computeSpecialForces(cr, inputrec, step, t, wcycle,
1743                          fr->forceProviders, box, x, mdatoms, lambda,
1744                          flags, fr->f_novirsum, vir_force, enerd,
1745                          ed, bNS);
1746
1747     if (bDoForces)
1748     {
1749         /* Communicate the forces */
1750         if (DOMAINDECOMP(cr))
1751         {
1752             wallcycle_start(wcycle, ewcMOVEF);
1753             dd_move_f(cr->dd, f, fr->fshift);
1754             /* Do we need to communicate the separate force array
1755              * for terms that do not contribute to the single sum virial?
1756              * Position restraints and electric fields do not introduce
1757              * inter-cg forces, only full electrostatics methods do.
1758              * When we do not calculate the virial, fr->f_novirsum = f,
1759              * so we have already communicated these forces.
1760              */
1761             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1762                 (flags & GMX_FORCE_VIRIAL))
1763             {
1764                 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1765             }
1766             wallcycle_stop(wcycle, ewcMOVEF);
1767         }
1768
1769         /* If we have NoVirSum forces but do not calculate the virial,
1770          * fr->f_novirsum aliases f and is handled later, in post_process_forces.
1771          */
1772         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1773         {
1774             wallcycle_start(wcycle, ewcVSITESPREAD);
1775             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1776                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1777             wallcycle_stop(wcycle, ewcVSITESPREAD);
1778         }
1779
1780         if (flags & GMX_FORCE_VIRIAL)
1781         {
1782             /* Calculation of the virial must be done after vsites! */
1783             calc_virial(0, mdatoms->homenr, x, f,
1784                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1785         }
1786     }
1787
1788     if (PAR(cr) && !(cr->duty & DUTY_PME))
1789     {
1790         /* In case of node-splitting, the PP nodes receive the long-range
1791          * forces, virial and energy from the PME nodes here.
1792          */
1793         pme_receive_force_ener(cr, wcycle, enerd, fr);
1794     }
1795
1796     if (bDoForces)
1797     {
1798         post_process_forces(cr, step, nrnb, wcycle,
1799                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1800                             flags);
1801     }
1802
1803     if (flags & GMX_FORCE_ENERGY)
1804     {
1805         /* Sum the potential energy terms from group contributions */
1806         sum_epot(&(enerd->grpp), enerd->term);
1807
1808         if (!EI_TPI(inputrec->eI))
1809         {
1810             checkPotentialEnergyValidity(enerd);
1811         }
1812     }
1813
1814 }
1815
1816 void do_force(FILE *fplog, t_commrec *cr,
1817               t_inputrec *inputrec,
1818               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1819               gmx_localtop_t *top,
1820               gmx_groups_t *groups,
1821               matrix box, PaddedRVecVector *coordinates, history_t *hist,
1822               PaddedRVecVector *force,
1823               tensor vir_force,
1824               t_mdatoms *mdatoms,
1825               gmx_enerdata_t *enerd, t_fcdata *fcd,
1826               gmx::ArrayRef<real> lambda, t_graph *graph,
1827               t_forcerec *fr,
1828               gmx_vsite_t *vsite, rvec mu_tot,
1829               double t, gmx_edsam_t ed,
1830               gmx_bool bBornRadii,
1831               int flags,
1832               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1833               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1834 {
1835     /* modify force flag if not doing nonbonded */
1836     if (!fr->bNonbonded)
1837     {
1838         flags &= ~GMX_FORCE_NONBONDED;
1839     }
1840
1841     GMX_ASSERT(coordinates->size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
1842     GMX_ASSERT(force->size()       >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
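         /* The extra elements give room for (unaligned) 4-wide SIMD loads of
          * rvec data past the last real atom, so kernels need no end-of-array
          * bounds checks (see also the savex allocation in do_constrain_first).
          */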
1843
1844     rvec *x = as_rvec_array(coordinates->data());
1845
1846     switch (inputrec->cutoff_scheme)
1847     {
1848         case ecutsVERLET:
1849             do_force_cutsVERLET(fplog, cr, inputrec,
1850                                 step, nrnb, wcycle,
1851                                 top,
1852                                 groups,
1853                                 box, x, hist,
1854                                 force, vir_force,
1855                                 mdatoms,
1856                                 enerd, fcd,
1857                                 lambda.data(), graph,
1858                                 fr, fr->ic,
1859                                 vsite, mu_tot,
1860                                 t, ed,
1861                                 bBornRadii,
1862                                 flags,
1863                                 ddOpenBalanceRegion,
1864                                 ddCloseBalanceRegion);
1865             break;
1866         case ecutsGROUP:
1867             do_force_cutsGROUP(fplog, cr, inputrec,
1868                                step, nrnb, wcycle,
1869                                top,
1870                                groups,
1871                                box, x, hist,
1872                                force, vir_force,
1873                                mdatoms,
1874                                enerd, fcd,
1875                                lambda.data(), graph,
1876                                fr, vsite, mu_tot,
1877                                t, ed,
1878                                bBornRadii,
1879                                flags,
1880                                ddOpenBalanceRegion,
1881                                ddCloseBalanceRegion);
1882             break;
1883         default:
1884             gmx_incons("Invalid cut-off scheme passed!");
1885     }
1886
1887     /* In case we don't have constraints and are using GPUs, the next
1888      * balancing region starts here.
1889      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1890      * virial calculation and COM pulling, is thus not included in the
1891      * balance timing, which is OK as most of these tasks do communication.
1892      */
1893     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1894     {
1895         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1896     }
1897 }
1898
1899
1900 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1901                         t_inputrec *ir, t_mdatoms *md,
1902                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1903                         t_forcerec *fr, gmx_localtop_t *top)
1904 {
1905     int             i, m, start, end;
1906     gmx_int64_t     step;
1907     real            dt = ir->delta_t;
1908     real            dvdl_dum;
1909     rvec           *savex;
1910
1911     /* We need to allocate one element extra, since we might use
1912      * (unaligned) 4-wide SIMD loads to access rvec entries.
1913      */
1914     snew(savex, state->natoms + 1);
1915
1916     start = 0;
1917     end   = md->homenr;
1918
1919     if (debug)
1920     {
1921         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1922                 start, md->homenr, end);
1923     }
1924     /* Do a first constraint pass to reset the particle positions... */
1925     step = ir->init_step;
1926     if (fplog)
1927     {
1928         char buf[STEPSTRSIZE];
1929         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1930                 gmx_step_str(step, buf));
1931     }
1932     dvdl_dum = 0;
1933
1934     /* constrain the current position */
1935     constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1936               ir, cr, step, 0, 1.0, md,
1937               as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1938               fr->bMolPBC, state->box,
1939               state->lambda[efptBONDED], &dvdl_dum,
1940               nullptr, nullptr, nrnb, econqCoord);
1941     if (EI_VV(ir->eI))
1942     {
1943         /* Constrain the initial velocity, and save it */
1944         /* This may also be useful if we need the ekin from the half step for velocity verlet */
1945         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1946                   ir, cr, step, 0, 1.0, md,
1947                   as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1948                   fr->bMolPBC, state->box,
1949                   state->lambda[efptBONDED], &dvdl_dum,
1950                   nullptr, nullptr, nrnb, econqVeloc);
1951     }
1952     /* Constrain the initial velocities at t-dt/2; leap-frog-style integrators store velocities half a step behind the positions */
1953     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1954     {
1955         for (i = start; (i < end); i++)
1956         {
1957             for (m = 0; (m < DIM); m++)
1958             {
1959                 /* Reverse the velocity */
1960                 state->v[i][m] = -state->v[i][m];
1961                 /* Store the position at t-dt in buf */
1962                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1963             }
1964         }
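             /* After the sign flip above, savex[i] = x(0) - dt*v(-dt/2),
              * i.e. the leap-frog position extrapolated one step back in time.
              */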
1965         /* Shake the positions at t=-dt with the positions at t=0
1966          * as reference coordinates.
1967          */
1968         if (fplog)
1969         {
1970             char buf[STEPSTRSIZE];
1971             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1972                     gmx_step_str(step, buf));
1973         }
1974         dvdl_dum = 0;
1975         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1976                   ir, cr, step, -1, 1.0, md,
1977                   as_rvec_array(state->x.data()), savex, nullptr,
1978                   fr->bMolPBC, state->box,
1979                   state->lambda[efptBONDED], &dvdl_dum,
1980                   as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
1981
1982         for (i = start; i < end; i++)
1983         {
1984             for (m = 0; m < DIM; m++)
1985             {
1986                 /* Re-reverse the velocities */
1987                 state->v[i][m] = -state->v[i][m];
1988             }
1989         }
1990     }
1991     sfree(savex);
1992 }
1993
1994
1995 static void
1996 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
1997                 double *enerout, double *virout)
1998 {
1999     double enersum, virsum;
2000     double invscale, invscale2, invscale3;
2001     double r, ea, eb, ec, pa, pb, pc, pd;
2002     double y0, f, g, h;
2003     int    ri, offset;
2004     double tabfactor;
2005
2006     invscale  = 1.0/scale;
2007     invscale2 = invscale*invscale;
2008     invscale3 = invscale*invscale2;
2009
2010     /* Following summation derived from cubic spline definition,
2011      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2012      * the cubic spline.  We first calculate the negative of the
2013      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2014      * add the more standard, abrupt cutoff correction to that result,
2015      * yielding the long-range correction for a switched function.  We
2016      * perform both the pressure and energy loops at the same time for
2017      * simplicity, as the computational cost is low. */
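         /* Sketch of the per-interval math, assuming the standard Y,F,G,H
          * spline layout used by these tables: on interval ri the potential
          * is V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with eps in [0,1] and
          * r = ri*invscale the interval start. Each integral of eps^n times
          * r^2 over the interval equals ec/(n+1) + eb/(n+2) + ea/(n+3), with
          * ea = invscale^3, eb = 2*r*invscale^2, ec = r^2*invscale, which is
          * exactly how enersum is accumulated below.
          */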
2018
2019     if (offstart == 0)
2020     {
2021         /* Since the dispersion table has been scaled down a factor
2022          * 6.0 and the repulsion a factor 12.0 to compensate for the
2023          * c6/c12 parameters inside nbfp[] being scaled up (to save
2024          * flops in kernels), we need to correct for this.
2025          */
2026         tabfactor = 6.0;
2027     }
2028     else
2029     {
2030         tabfactor = 12.0;
2031     }
2032
2033     enersum = 0.0;
2034     virsum  = 0.0;
2035     for (ri = rstart; ri < rend; ++ri)
2036     {
2037         r  = ri*invscale;
2038         ea = invscale3;
2039         eb = 2.0*invscale2*r;
2040         ec = invscale*r*r;
2041
2042         pa = invscale3;
2043         pb = 3.0*invscale2*r;
2044         pc = 3.0*invscale*r*r;
2045         pd = r*r*r;
2046
2047         /* The stride 8 comes from the vdwtab packing: dispersion then repulsion,
2048            four cubic-spline coefficients (Y, F, G, H) each; offstart selects 0 or 4. */
2049
2050         offset = 8*ri + offstart;
2051         y0     = vdwtab[offset];
2052         f      = vdwtab[offset+1];
2053         g      = vdwtab[offset+2];
2054         h      = vdwtab[offset+3];
2055
2056         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2057         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2058     }
2059     *enerout = 4.0*M_PI*enersum*tabfactor;
2060     *virout  = 4.0*M_PI*virsum*tabfactor;
2061 }
2062
2063 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2064 {
2065     double   eners[2], virs[2], enersum, virsum;
2066     double   r0, rc3, rc9;
2067     int      ri0, ri1, i;
2068     real     scale, *vdwtab;
2069
2070     fr->enershiftsix    = 0;
2071     fr->enershifttwelve = 0;
2072     fr->enerdiffsix     = 0;
2073     fr->enerdifftwelve  = 0;
2074     fr->virdiffsix      = 0;
2075     fr->virdifftwelve   = 0;
2076
2077     const interaction_const_t *ic = fr->ic;
2078
2079     if (eDispCorr != edispcNO)
2080     {
2081         for (i = 0; i < 2; i++)
2082         {
2083             eners[i] = 0;
2084             virs[i]  = 0;
2085         }
2086         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2087             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2088             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2089             (ic->vdwtype == evdwSHIFT) ||
2090             (ic->vdwtype == evdwSWITCH))
2091         {
2092             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2093                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2094                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2095             {
2096                 gmx_fatal(FARGS,
2097                           "With dispersion correction rvdw-switch cannot be zero "
2098                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2099             }
2100
2101             /* TODO This code depends on the logic in tables.c that
2102                constructs the table layout, which should be made
2103                explicit in future cleanup. */
2104             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2105                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2106             scale  = fr->dispersionCorrectionTable->scale;
2107             vdwtab = fr->dispersionCorrectionTable->data;
2108
2109             /* Round the cut-offs to exact table values for precision */
2110             ri0  = static_cast<int>(floor(ic->rvdw_switch*scale));
2111             ri1  = static_cast<int>(ceil(ic->rvdw*scale));
2112
2113             /* The code below has some support for handling force-switching, i.e.
2114              * when the force (instead of potential) is switched over a limited
2115              * region. This leads to a constant shift in the potential inside the
2116              * switching region, which we can handle by adding a constant energy
2117              * term in the force-switch case just like when we do potential-shift.
2118              *
2119              * For now this is not enabled, but to keep the functionality in the
2120              * code we check separately for switch and shift. When we do force-switch
2121              * the shifting point is rvdw_switch, while it is the cutoff when we
2122              * have a classical potential-shift.
2123              *
2124              * For a pure potential-shift the potential has a constant shift
2125              * all the way out to the cutoff, and that is it. For other forms
2126              * we need to calculate the constant shift up to the point where we
2127              * start modifying the potential.
2128              */
2129             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2130
2131             r0   = ri0/scale;
2132             rc3  = r0*r0*r0;
2133             rc9  = rc3*rc3*rc3;
2134
2135             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2136                 (ic->vdwtype == evdwSHIFT))
2137             {
2138                 /* Determine the constant energy shift below rvdw_switch.
2139                  * Table has a scale factor since we have scaled it down to compensate
2140                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2141                  */
2142                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2143                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2144             }
2145             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2146             {
2147                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2148                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2149             }
2150
2151             /* Add the constant part from 0 to r0 (the shifting point set above).
2152              * This integration from 0 to r0 overcounts the number
2153              * of interactions by 1, as it also counts the self interaction.
2154              * We will correct for this later.
2155              */
2156             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2157             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2158
2159             /* Calculate the contribution in the range [r0,r1] where we
2160              * modify the potential. For a pure potential-shift modifier we will
2161              * have ri0==ri1, and there will not be any contribution here.
2162              */
2163             for (i = 0; i < 2; i++)
2164             {
2165                 enersum = 0;
2166                 virsum  = 0;
2167                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2168                 eners[i] -= enersum;
2169                 virs[i]  -= virsum;
2170             }
2171
2172             /* Alright: Above we compensated by REMOVING the parts outside r0
2173              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2174              *
2175              * Regardless of whether r0 is the point where we start switching,
2176              * or the cutoff where we calculated the constant shift, we include
2177              * all the parts we are missing out to infinity from r0 by
2178              * calculating the analytical dispersion correction.
2179              */
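                 /* For reference, with g(r) = 1 these are the exact integrals
                  *   -int_{r0}^{inf} 4*pi*r^2 * r^-6  dr = -4*pi/(3*r0^3)
                  *    int_{r0}^{inf} 4*pi*r^2 * r^-12 dr =  4*pi/(9*r0^9)
                  * and the virial terms follow from r*dV/dr; the global
                  * factor 0.5 from the GROMACS virial definition is applied
                  * at the end of this function.
                  */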
2180             eners[0] += -4.0*M_PI/(3.0*rc3);
2181             eners[1] +=  4.0*M_PI/(9.0*rc9);
2182             virs[0]  +=  8.0*M_PI/rc3;
2183             virs[1]  += -16.0*M_PI/(3.0*rc9);
2184         }
2185         else if (ic->vdwtype == evdwCUT ||
2186                  EVDW_PME(ic->vdwtype) ||
2187                  ic->vdwtype == evdwUSER)
2188         {
2189             if (ic->vdwtype == evdwUSER && fplog)
2190             {
2191                 fprintf(fplog,
2192                         "WARNING: using dispersion correction with user tables\n");
2193             }
2194
2195             /* Note that with LJ-PME, the dispersion correction is multiplied
2196              * by the difference between the actual C6 and the value of C6
2197              * that would produce the combination rule.
2198              * This means the normal energy and virial difference formulas
2199              * can be used here.
2200              */
2201
2202             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2203             rc9  = rc3*rc3*rc3;
2204             /* Contribution beyond the cut-off */
2205             eners[0] += -4.0*M_PI/(3.0*rc3);
2206             eners[1] +=  4.0*M_PI/(9.0*rc9);
2207             if (ic->vdw_modifier == eintmodPOTSHIFT)
2208             {
2209                 /* Contribution within the cut-off */
2210                 eners[0] += -4.0*M_PI/(3.0*rc3);
2211                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2212             }
2213             /* Contribution beyond the cut-off */
2214             virs[0]  +=  8.0*M_PI/rc3;
2215             virs[1]  += -16.0*M_PI/(3.0*rc9);
2216         }
2217         else
2218         {
2219             gmx_fatal(FARGS,
2220                       "Dispersion correction is not implemented for vdw-type = %s",
2221                       evdw_names[ic->vdwtype]);
2222         }
2223
2224         /* When we deprecate the group kernels the code below can go too */
2225         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2226         {
2227             /* Calculate self-interaction coefficient (assuming that
2228              * the reciprocal-space contribution is constant in the
2229              * region that contributes to the self-interaction).
2230              */
2231             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2232
2233             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2234             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2235         }
2236
2237         fr->enerdiffsix    = eners[0];
2238         fr->enerdifftwelve = eners[1];
2239         /* The 0.5 is due to the Gromacs definition of the virial */
2240         fr->virdiffsix     = 0.5*virs[0];
2241         fr->virdifftwelve  = 0.5*virs[1];
2242     }
2243 }
2244
2245 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2246                    matrix box, real lambda, tensor pres, tensor virial,
2247                    real *prescorr, real *enercorr, real *dvdlcorr)
2248 {
2249     gmx_bool bCorrAll, bCorrPres;
2250     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2251     int      m;
2252
2253     *prescorr = 0;
2254     *enercorr = 0;
2255     *dvdlcorr = 0;
2256
2257     clear_mat(virial);
2258     clear_mat(pres);
2259
2260     if (ir->eDispCorr != edispcNO)
2261     {
2262         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2263                      ir->eDispCorr == edispcAllEnerPres);
2264         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2265                      ir->eDispCorr == edispcAllEnerPres);
2266
2267         invvol = 1/det(box);
2268         if (fr->n_tpi)
2269         {
2270             /* Only correct for the interactions with the inserted molecule */
2271             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2272             ninter = fr->n_tpi;
2273         }
2274         else
2275         {
2276             dens   = fr->numAtomsForDispersionCorrection*invvol;
2277             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2278         }
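             /* Bookkeeping note: without TPI, ninter*dens = N^2/(2V), so the
              * energy correction below is the usual homogeneous-fluid tail
              * correction, <C6>*N^2/(2V) times the integral computed in
              * calc_enervirdiff (with g(r) taken as 1).
              */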
2279
2280         if (ir->efep == efepNO)
2281         {
2282             avcsix    = fr->avcsix[0];
2283             avctwelve = fr->avctwelve[0];
2284         }
2285         else
2286         {
2287             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2288             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2289         }
2290
2291         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2292         *enercorr += avcsix*enerdiff;
2293         dvdlambda  = 0.0;
2294         if (ir->efep != efepNO)
2295         {
2296             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2297         }
2298         if (bCorrAll)
2299         {
2300             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2301             *enercorr += avctwelve*enerdiff;
2302             if (fr->efep != efepNO)
2303             {
2304                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2305             }
2306         }
2307
2308         if (bCorrPres)
2309         {
2310             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2311             if (ir->eDispCorr == edispcAllEnerPres)
2312             {
2313                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2314             }
2315             /* The factor 2 is because of the Gromacs virial definition */
2316             spres = -2.0*invvol*svir*PRESFAC;
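                 /* PRESFAC converts the correction from internal units
                  * (kJ mol^-1 nm^-3) to bar; the loop below adds these
                  * isotropic terms to the diagonals of virial and pres.
                  */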
2317
2318             for (m = 0; m < DIM; m++)
2319             {
2320                 virial[m][m] += svir;
2321                 pres[m][m]   += spres;
2322             }
2323             *prescorr += spres;
2324         }
2325
2326         /* We can't currently control when this prints; for now, just print when debugging */
2327         if (debug)
2328         {
2329             if (bCorrAll)
2330             {
2331                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2332                         avcsix, avctwelve);
2333             }
2334             if (bCorrPres)
2335             {
2336                 fprintf(debug,
2337                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2338                         *enercorr, spres, svir);
2339             }
2340             else
2341             {
2342                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2343             }
2344         }
2345
2346         if (fr->efep != efepNO)
2347         {
2348             *dvdlcorr += dvdlambda;
2349         }
2350     }
2351 }
2352
2353 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2354                             const gmx_mtop_t *mtop, rvec x[],
2355                             gmx_bool bFirst)
2356 {
2357     t_graph        *graph;
2358     int             mb, as, mol;
2359     gmx_molblock_t *molb;
2360
2361     if (bFirst && fplog)
2362     {
2363         fprintf(fplog, "Removing pbc first time\n");
2364     }
2365
2366     snew(graph, 1);
2367     as = 0;
2368     for (mb = 0; mb < mtop->nmolblock; mb++)
2369     {
2370         molb = &mtop->molblock[mb];
2371         if (molb->natoms_mol == 1 ||
2372             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2373         {
2374             /* Just one atom or charge group in the molecule, no PBC required */
2375             as += molb->nmol*molb->natoms_mol;
2376         }
2377         else
2378         {
2379             /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2380             mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2381                            0, molb->natoms_mol, FALSE, FALSE, graph);
2382
2383             for (mol = 0; mol < molb->nmol; mol++)
2384             {
2385                 mk_mshift(fplog, graph, ePBC, box, x+as);
2386
2387                 shift_self(graph, box, x+as);
2388                 /* The molecule is whole now.
2389                  * We don't need the second mk_mshift call as in do_pbc_first,
2390                  * since we no longer need this graph.
2391                  */
2392
2393                 as += molb->natoms_mol;
2394             }
2395             done_graph(graph);
2396         }
2397     }
2398     sfree(graph);
2399 }
2400
2401 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2402                        const gmx_mtop_t *mtop, rvec x[])
2403 {
2404     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2405 }
2406
2407 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2408                  const gmx_mtop_t *mtop, rvec x[])
2409 {
2410     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2411 }
2412
2413 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2414 {
2415     int t, nth;
2416     nth = gmx_omp_nthreads_get(emntDefault);
2417
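         /* Partition the atoms into nth contiguous chunks; the rounding in
          * (natoms*t)/nth guarantees the chunks tile [0, natoms) exactly,
          * with sizes differing by at most one atom.
          */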
2418 #pragma omp parallel for num_threads(nth) schedule(static)
2419     for (t = 0; t < nth; t++)
2420     {
2421         try
2422         {
2423             int offset, len;
2424
2425             offset = (natoms*t    )/nth;
2426             len    = (natoms*(t + 1))/nth - offset;
2427             put_atoms_in_box(ePBC, box, len, x + offset);
2428         }
2429         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2430     }
2431 }
2432
2433 // TODO This can be cleaned up a lot, and move back to runner.cpp
2434 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2435                 const t_inputrec *inputrec,
2436                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2437                 gmx_walltime_accounting_t walltime_accounting,
2438                 nonbonded_verlet_t *nbv,
2439                 gmx_pme_t *pme,
2440                 gmx_bool bWriteStat)
2441 {
2442     t_nrnb *nrnb_tot = nullptr;
2443     double  delta_t  = 0;
2444     double  nbfs     = 0, mflop = 0;
2445     double  elapsed_time,
2446             elapsed_time_over_all_ranks,
2447             elapsed_time_over_all_threads,
2448             elapsed_time_over_all_threads_over_all_ranks;
2449     /* Control whether it is valid to print a report. Only the
2450        simulation master may print, but it should not do so if the run
2451        terminated e.g. before a scheduled reset step. This is
2452        complicated by the fact that PME ranks are unaware of the
2453        reason why they were sent a pmerecvqxFINISH. To avoid
2454        communication deadlocks, we always do the communication for the
2455        report, even if we've decided not to write the report, because
2456        how long it takes to finish the run is not important when we've
2457        decided not to report on the simulation performance. */
2458     bool    printReport = SIMMASTER(cr);
2459
2460     if (!walltime_accounting_get_valid_finish(walltime_accounting))
2461     {
2462         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2463         printReport = false;
2464     }
2465
2466     if (cr->nnodes > 1)
2467     {
2468         snew(nrnb_tot, 1);
2469 #if GMX_MPI
2470         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2471                       cr->mpi_comm_mysim);
2472 #endif
2473     }
2474     else
2475     {
2476         nrnb_tot = nrnb;
2477     }
2478
2479     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2480     elapsed_time_over_all_ranks                  = elapsed_time;
2481     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2482     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2483 #if GMX_MPI
2484     if (cr->nnodes > 1)
2485     {
2486         /* reduce elapsed_time over all MPI ranks in the current simulation */
2487         MPI_Allreduce(&elapsed_time,
2488                       &elapsed_time_over_all_ranks,
2489                       1, MPI_DOUBLE, MPI_SUM,
2490                       cr->mpi_comm_mysim);
2491         elapsed_time_over_all_ranks /= cr->nnodes;
2492         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2493          * current simulation. */
2494         MPI_Allreduce(&elapsed_time_over_all_threads,
2495                       &elapsed_time_over_all_threads_over_all_ranks,
2496                       1, MPI_DOUBLE, MPI_SUM,
2497                       cr->mpi_comm_mysim);
2498     }
2499 #endif
2500
2501     if (printReport)
2502     {
2503         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2504     }
2505     if (cr->nnodes > 1)
2506     {
2507         sfree(nrnb_tot);
2508     }
2509
2510     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2511     {
2512         print_dd_statistics(cr, inputrec, fplog);
2513     }
2514
2515     /* TODO Move the responsibility for any scaling by thread counts
2516      * to the code that handled the thread region, so that there's a
2517      * mechanism to keep cycle counting working during the transition
2518      * to task parallelism. */
2519     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2520     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2521     wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2522     auto cycle_sum(wallcycle_sum(cr, wcycle));
2523
2524     if (printReport)
2525     {
2526         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2527         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2528         if (pme_gpu_task_enabled(pme))
2529         {
2530             pme_gpu_get_timings(pme, &pme_gpu_timings);
2531         }
2532         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2533                         elapsed_time_over_all_ranks,
2534                         wcycle, cycle_sum,
2535                         nbnxn_gpu_timings,
2536                         &pme_gpu_timings);
2537
2538         if (EI_DYNAMICS(inputrec->eI))
2539         {
2540             delta_t = inputrec->delta_t;
2541         }
2542
2543         if (fplog)
2544         {
2545             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2546                        elapsed_time_over_all_ranks,
2547                        walltime_accounting_get_nsteps_done(walltime_accounting),
2548                        delta_t, nbfs, mflop);
2549         }
2550         if (bWriteStat)
2551         {
2552             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2553                        elapsed_time_over_all_ranks,
2554                        walltime_accounting_get_nsteps_done(walltime_accounting),
2555                        delta_t, nbfs, mflop);
2556         }
2557     }
2558 }
2559
2560 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2561 {
2562     /* this function works, but could probably use a logic rewrite to keep all the different
2563        types of efep straight. */
2564
2565     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2566     {
2567         return;
2568     }
2569
2570     t_lambda *fep = ir->fepvals;
2571     *fep_state    = fep->init_fep_state; /* this might overwrite the value from
2572                                             the checkpoint if one is set -- a kludge
2573                                             is in place for now to prevent this. */
2574
2575     for (int i = 0; i < efptNR; i++)
2576     {
2577         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2578         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2579         {
2580             lambda[i] = fep->init_lambda;
2581             if (lam0)
2582             {
2583                 lam0[i] = lambda[i];
2584             }
2585         }
2586         else
2587         {
2588             lambda[i] = fep->all_lambda[i][*fep_state];
2589             if (lam0)
2590             {
2591                 lam0[i] = lambda[i];
2592             }
2593         }
2594     }
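         /* At this point each lambda component (coul, vdw, bonded, ...) holds
          * either the scalar init_lambda (legacy input) or its column of the
          * all_lambda table at the chosen fep_state; lam0, when non-null,
          * keeps a copy of these starting values.
          */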
2595     if (ir->bSimTemp)
2596     {
2597         /* need to rescale control temperatures to match current state */
2598         for (int i = 0; i < ir->opts.ngtc; i++)
2599         {
2600             if (ir->opts.ref_t[i] > 0)
2601             {
2602                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2603             }
2604         }
2605     }
2606
2607     /* Write the initial lambda components to the log */
2608     if (fplog != nullptr)
2609     {
2610         fprintf(fplog, "Initial vector of lambda components:[ ");
2611         for (const auto &l : lambda)
2612         {
2613             fprintf(fplog, "%10.4f ", l);
2614         }
2615         fprintf(fplog, "]\n");
2616     }
2617     return;
2618 }
2619
2620
2621 void init_md(FILE *fplog,
2622              t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2623              t_inputrec *ir, const gmx_output_env_t *oenv,
2624              const MdrunOptions &mdrunOptions,
2625              double *t, double *t0,
2626              t_state *globalState, double *lam0,
2627              t_nrnb *nrnb, gmx_mtop_t *mtop,
2628              gmx_update_t **upd,
2629              int nfile, const t_filenm fnm[],
2630              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2631              tensor force_vir, tensor shake_vir, rvec mu_tot,
2632              gmx_bool *bSimAnn, t_vcm **vcm,
2633              gmx_wallcycle_t wcycle)
2634 {
2635     int  i;
2636
2637     /* Initial values */
2638     *t = *t0       = ir->init_t;
2639
2640     *bSimAnn = FALSE;
2641     for (i = 0; i < ir->opts.ngtc; i++)
2642     {
2643         /* set bSimAnn if any group is being annealed */
2644         if (ir->opts.annealing[i] != eannNO)
2645         {
2646             *bSimAnn = TRUE;
2647         }
2648     }
2649
2650     /* Initialize lambda variables */
2651     /* TODO: Clean up initialization of fep_state and lambda in t_state.
2652      * We currently need to call initialize_lambdas on non-master ranks
2653      * to initialize lam0.
2654      */
2655     if (MASTER(cr))
2656     {
2657         initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2658     }
2659     else
2660     {
2661         int                      tmpFepState;
2662         std::array<real, efptNR> tmpLambda;
2663         initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2664     }
2665
2666     // TODO upd is never NULL in practice, but the analysers don't know that
2667     if (upd)
2668     {
2669         *upd = init_update(ir);
2670     }
2671     if (*bSimAnn)
2672     {
2673         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2674     }
2675
2676     if (vcm != nullptr)
2677     {
2678         *vcm = init_vcm(fplog, &mtop->groups, ir);
2679     }
2680
2681     if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2682     {
2683         if (ir->etc == etcBERENDSEN)
2684         {
2685             please_cite(fplog, "Berendsen84a");
2686         }
2687         if (ir->etc == etcVRESCALE)
2688         {
2689             please_cite(fplog, "Bussi2007a");
2690         }
2691         if (ir->eI == eiSD1)
2692         {
2693             please_cite(fplog, "Goga2012");
2694         }
2695     }
2696     init_nrnb(nrnb);
2697
2698     if (nfile != -1)
2699     {
2700         *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2701
2702         *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2703                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2704     }
2705
2706     /* Initiate variables */
2707     clear_mat(force_vir);
2708     clear_mat(shake_vir);
2709     clear_rvec(mu_tot);
2710 }