/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "sim_util.h"

#include "config.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <cstdint>

#include <array>

#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"

void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#if !GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    fflush(out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#if !GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}

void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char   time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}

void print_start(FILE *fplog, t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}

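/*! \brief Adds the forces in \p forceToAdd to \p f, OpenMP-parallelized over atoms */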
static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
{
    /* TODO: remove this - 1 when padding is properly implemented */
    int         end  = forceToAdd->size() - 1;
    const rvec *fAdd = as_rvec_array(forceToAdd->data());

    // cppcheck-suppress unreadVariable
    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], fAdd[i]);
    }
}

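/*! \brief Computes the partial short-range virial for the local atoms and
 * adds the position-restraint and wall contributions.
 */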
static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i;

    /* The short-range virial from surrounding boxes */
    clear_mat(vir_part);
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

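/*! \brief Computes the COM pull forces and potential, accumulating the
 * energy into \p enerd and the restraint dvdl into the linear dvdl terms.
 */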
static void pull_potential_wrapper(t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces; this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}

static void pme_receive_force_ener(t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e_q, e_lj, dvdl_q, dvdl_lj;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

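/*! \brief Prints all local atoms with a force larger than \p forceTolerance
 * and exits with a fatal error when non-finite forces are detected.
 */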
static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_int64_t step, real forceTolerance,
                               const rvec *x, const rvec *f)
{
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
    }
}

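/*! \brief Post-processes the forces: spreads the mesh forces from virtual
 * sites, adds the no-virial-summation forces and the mesh contributions to
 * the virial, and optionally prints large forces to stderr.
 */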
static void post_process_forces(t_commrec *cr,
                                gmx_int64_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces; this is local */
            sum_forces(f, fr->f_novirsum);

            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (EVDW_PME(fr->vdwtype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_lj_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

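/*! \brief Runs (or launches) the Verlet-scheme non-bonded kernel for the
 * given interaction locality and accounts the pair-interaction flops.
 *
 * Dispatches between the plain-C, SIMD and GPU nbnxn kernels based on the
 * kernel type set up in \p fr->nbv.
 */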
static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    nonbonded_verlet_group_t  *nbvg;
    gmx_bool                   bUsingGpuKernels;

    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nbvg = &fr->nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }
    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbvg->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbvg->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            gmx_incons("Invalid nonbonded kernel type passed!");

    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}

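/*! \brief Computes the perturbed (free-energy) non-bonded interactions with
 * the group-scheme free-energy kernels, including the foreign-lambda
 * energies when dH/dl output is requested with soft-core interactions.
 */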
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             t_mdatoms            *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
    }

    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we are doing foreign lambda and have soft-core interactions,
     * we have to recalculate the (non-linear) energy contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}

gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}

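/*! \brief Zeroes \p n rvecs in \p v, using OpenMP threads only when the task is large enough */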
static gmx_inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}

/*! \brief  This routine checks if the potential energy is finite.
 *
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] enerd  The energy data; the non-bonded group energies need to be added in here before calling this routine
 */
static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
{
    if (!std::isfinite(enerd->term[F_EPOT]))
    {
        gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
                  enerd->term[F_EPOT],
                  enerd->term[F_LJ],
                  enerd->term[F_COUL_SR]);
    }
}

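/*! \brief Computes forces (and energies and the virial, when requested) for
 * the Verlet cut-off scheme.
 *
 * Handles pair search, GPU kernel launches and waits, halo communication
 * with domain decomposition, and the listed and PME force computation
 * through do_force_lowlevel().
 */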
void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                         t_inputrec *inputrec,
                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                         gmx_localtop_t *top,
                         gmx_groups_t gmx_unused *groups,
                         matrix box, rvec x[], history_t *hist,
                         PaddedRVecVector *force,
                         tensor vir_force,
                         t_mdatoms *mdatoms,
                         gmx_enerdata_t *enerd, t_fcdata *fcd,
                         real *lambda, t_graph *graph,
                         t_forcerec *fr, interaction_const_t *ic,
                         gmx_vsite_t *vsite, rvec mu_tot,
                         double t, gmx_edsam_t ed,
                         gmx_bool bBornRadii,
                         int flags,
                         DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                         DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    gmx_bool            bDiffKernels = FALSE;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu;

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS &&
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
    }

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->grp[0].nbat);

#if GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        gmx_bool bBS;
        matrix   boxs;

        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */

        wallcycle_start(wcycle, ewcPP_PMESENDX);

        bBS = (inputrec->nwall == 2);
        if (bBS)
        {
            copy_mat(box, boxs);
            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
        }

        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
                                 step);

        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, nullptr,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->grp[eintLocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        if (nbv->ngrp == 1 ||
            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        else
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
                               nbv->nbs, mdatoms, fr->cginfo);
            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                            &top->excls,
                            ic->rlist,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->grp[eintLocal].nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            ddOpenBalanceRegionGpu(cr->dd);
        }

        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
        /* launch local nonbonded F on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     nrnb, wcycle);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
                        nbv->grp[eintLocal].kernel_type);

        if (bDiffKernels)
        {
            /* With GPU+CPU non-bonded calculations we need to copy
             * the local coordinates to the non-local nbat struct
             * (in CPU format) as the non-local kernel call also
             * calculates the local - non-local interactions.
             */
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (bDiffKernels)
            {
                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
            }

            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                &top->excls,
                                ic->rlist,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);

            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU && !bDiffKernels)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
            /* launch non-local nonbonded F on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         nrnb, wcycle);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        if (DOMAINDECOMP(cr) && !bDiffKernels)
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
                                 flags, eatLocal);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

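    /* Reduce the partial dipole moments over all ranks and store the
     * state A and state B dipoles in fr->mu_tot.
     */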
    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force->data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    if (bDoForces)
    {
        /* Reset forces for which the virial is calculated separately:
         * PME/Ewald forces if necessary */
        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                fr->f_novirsum = fr->forceBufferNoVirialSummation;
            }
            else
            {
                /* We are not calculating the pressure so we do not need
                 * a separate array for forces that do not contribute
                 * to the pressure.
                 */
                fr->f_novirsum = force;
            }
        }

        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                /* TODO: remove this - 1 when padding is properly implemented */
                clear_rvecs_omp(fr->f_novirsum->size() - 1,
                                as_rvec_array(fr->f_novirsum->data()));
            }
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* When the non-bonded forces are computed on the CPU, we do that here.
     * We do this before calling do_force_lowlevel, because in that
     * function the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        /* Maybe we should move this into do_force_lowlevel */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU || bDiffKernels)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
                         nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded forces to the normal force array.
         * This can be split into a local and a non-local part when
         * overlapping communication with computation under domain
         * decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, x, mdatoms, box, top);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, nrnb, wcycle, mdatoms,
                      x, hist, f, enerd, fcd, top, fr->born,
                      bBornRadii, box,
                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);

    if (ed)
    {
        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
    }

    if (bUseOrEmulGPU && !bDiffKernels)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                       flags, eatNonlocal,
                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                       fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                               nbv->grp[eintNonlocal].nbat, f);
            }
            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
        if (bDoForces)
        {
            wallcycle_start(wcycle, ewcMOVEF);
            dd_move_f(cr->dd, f, fr->fshift);
            wallcycle_stop(wcycle, ewcMOVEF);
        }
    }

    if (bUseOrEmulGPU)
    {
        /* wait for local forces (or calculate in emulation mode) */
        if (bUseGPU)
        {
            /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
             * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
             * but even with a step of 0.1 ms the difference is less than 1%
             * of the step time.
             */
            const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
                                   flags, eatLocal,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
            float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

            if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            {
                DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
                if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
                {
                    /* We measured few cycles, it could be that the kernel
                     * and transfer finished earlier and there was no actual
                     * wait time, only API call overhead.
                     * Then the actual time could be anywhere between 0 and
                     * cycles_tmp. We will use half of cycles_tmp.
                     */
                    waitedForGpu = DdBalanceRegionWaitedForGpu::no;
                }
                ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
            }

            /* now clear the GPU outputs while we finish the step on the CPU */
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
        else
        {
            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                         nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);
        }
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                       nbv->grp[eintLocal].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }

    if (bDoForces)
    {
        /* Collect forces from modules */
        gmx::ArrayRef<gmx::RVec> fNoVirSum;
        if (fr->bF_NoVirSum)
        {
            fNoVirSum = *fr->f_novirsum;
        }
        fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
        {
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, x, f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        /* Since the COM pulling is always done mass-weighted, no forces are
         * applied to vsites and this call can be done after vsite spreading.
         */
        pull_potential_wrapper(cr, inputrec, box, x,
                               f, vir_force, mdatoms, enerd, lambda, t,
                               wcycle);
    }

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);

    if (PAR(cr) && !(cr->duty & DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, wcycle, enerd, fr);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(enerd);
        }
    }
}

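/*! \brief Computes forces (and energies and the virial, when requested) for
 * the group cut-off scheme.
 */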
1422 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1423                         t_inputrec *inputrec,
1424                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1425                         gmx_localtop_t *top,
1426                         gmx_groups_t *groups,
1427                         matrix box, rvec x[], history_t *hist,
1428                         PaddedRVecVector *force,
1429                         tensor vir_force,
1430                         t_mdatoms *mdatoms,
1431                         gmx_enerdata_t *enerd, t_fcdata *fcd,
1432                         real *lambda, t_graph *graph,
1433                         t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1434                         double t, gmx_edsam_t ed,
1435                         gmx_bool bBornRadii,
1436                         int flags,
1437                         DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1438                         DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1439 {
1440     int        cg0, cg1, i, j;
1441     double     mu[2*DIM];
1442     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1443     gmx_bool   bDoForces;
1444     float      cycles_pme;
1445
1446     const int  start  = 0;
1447     const int  homenr = mdatoms->homenr;
1448
1449     clear_mat(vir_force);
1450
1451     cg0 = 0;
1452     if (DOMAINDECOMP(cr))
1453     {
1454         cg1 = cr->dd->ncg_tot;
1455     }
1456     else
1457     {
1458         cg1 = top->cgs.nr;
1459     }
1460     if (fr->n_tpi > 0)
1461     {
1462         cg1--;
1463     }
1464
1465     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1466     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1467     /* Should we put the charge groups on the neighbor-searching grid this step? */
1468     bFillGrid      = (bNS && bStateChanged);
1469     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1470     bDoForces      = (flags & GMX_FORCE_FORCES);
1471
1472     if (bStateChanged)
1473     {
1474         update_forcerec(fr, box);
1475
1476         if (inputrecNeedMutot(inputrec))
1477         {
1478             /* Calculate total (local) dipole moment in a temporary common array.
1479              * This makes it possible to sum them over nodes faster.
1480              */
1481             calc_mu(start, homenr,
1482                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1483                     mu, mu+DIM);
1484         }
1485     }
1486
1487     if (fr->ePBC != epbcNONE)
1488     {
1489         /* Compute shift vectors every step,
1490          * because of pressure coupling or box deformation!
1491          */
1492         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1493         {
1494             calc_shifts(box, fr->shift_vec);
1495         }
1496
1497         if (bCalcCGCM)
1498         {
1499             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1500                                      &(top->cgs), x, fr->cg_cm);
1501             inc_nrnb(nrnb, eNR_CGCM, homenr);
1502             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1503         }
1504         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1505         {
1506             unshift_self(graph, box, x);
1507         }
1508     }
1509     else if (bCalcCGCM)
1510     {
1511         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1512         inc_nrnb(nrnb, eNR_CGCM, homenr);
1513     }
1514
1515     if (bCalcCGCM && gmx_debug_at)
1516     {
1517         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1518     }
1519
1520 #if GMX_MPI
1521     if (!(cr->duty & DUTY_PME))
1522     {
1523         gmx_bool bBS;
1524         matrix   boxs;
1525
1526         /* Send particle coordinates to the pme nodes.
1527          * Since this is only implemented for domain decomposition
1528          * and domain decomposition does not use the graph,
1529          * we do not need to worry about shifting.
1530          */
1531
1532         wallcycle_start(wcycle, ewcPP_PMESENDX);
1533
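             /* Note: with two walls the system is periodic only in x and y.
              * The box passed to PME below is scaled in z by wall_ewald_zfac,
              * which gives the Ewald sums a larger vacuum gap between the
              * periodic images in z.
              */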
1534         bBS = (inputrec->nwall == 2);
1535         if (bBS)
1536         {
1537             copy_mat(box, boxs);
1538             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1539         }
1540
1541         gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1542                                  lambda[efptCOUL], lambda[efptVDW],
1543                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1544                                  step);
1545
1546         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1547     }
1548 #endif /* GMX_MPI */
1549
1550     /* Communicate coordinates and sum dipole if necessary */
1551     if (DOMAINDECOMP(cr))
1552     {
1553         wallcycle_start(wcycle, ewcMOVEX);
1554         dd_move_x(cr->dd, box, x);
1555         wallcycle_stop(wcycle, ewcMOVEX);
1556         /* No GPU support, no move_x overlap, so reopen the balance region here */
1557         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1558         {
1559             ddReopenBalanceRegionCpu(cr->dd);
1560         }
1561     }
1562
1563     if (inputrecNeedMutot(inputrec))
1564     {
1565         if (bStateChanged)
1566         {
1567             if (PAR(cr))
1568             {
1569                 gmx_sumd(2*DIM, mu, cr);
1570                 ddReopenBalanceRegionCpu(cr->dd);
1571             }
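                 /* calc_mu() stored the state A dipole in mu[0..DIM-1] and
                  * the state B dipole in mu[DIM..2*DIM-1]; unpack them into
                  * fr->mu_tot[0] and fr->mu_tot[1] here.
                  */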
1572             for (i = 0; i < 2; i++)
1573             {
1574                 for (j = 0; j < DIM; j++)
1575                 {
1576                     fr->mu_tot[i][j] = mu[i*DIM + j];
1577                 }
1578             }
1579         }
1580         if (fr->efep == efepNO)
1581         {
1582             copy_rvec(fr->mu_tot[0], mu_tot);
1583         }
1584         else
1585         {
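                 /* With perturbed charges, interpolate the total dipole
                  * linearly in the coulomb lambda between states A and B.
                  */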
1586             for (j = 0; j < DIM; j++)
1587             {
1588                 mu_tot[j] =
1589                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1590             }
1591         }
1592     }
1593
1594     /* Reset energies */
1595     reset_enerdata(enerd);
1596     clear_rvecs(SHIFTS, fr->fshift);
1597
1598     if (bNS)
1599     {
1600         wallcycle_start(wcycle, ewcNS);
1601
1602         if (graph && bStateChanged)
1603         {
1604             /* Calculate intramolecular shift vectors to make molecules whole */
1605             mk_mshift(fplog, graph, fr->ePBC, box, x);
1606         }
1607
1608         /* Do the actual neighbour searching */
1609         ns(fplog, fr, box,
1610            groups, top, mdatoms,
1611            cr, nrnb, bFillGrid);
1612
1613         wallcycle_stop(wcycle, ewcNS);
1614     }
1615
1616     if (inputrec->implicit_solvent && bNS)
1617     {
1618         make_gb_nblist(cr, inputrec->gb_algorithm,
1619                        x, box, fr, &top->idef, graph, fr->born);
1620     }
1621
1622     if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1623     {
1624         wallcycle_start(wcycle, ewcPPDURINGPME);
1625         dd_force_flop_start(cr->dd, nrnb);
1626     }
1627
1628     if (inputrec->bRot)
1629     {
1630         wallcycle_start(wcycle, ewcROT);
1631         do_rotation(cr, inputrec, box, x, t, step, bNS);
1632         wallcycle_stop(wcycle, ewcROT);
1633     }
1634
1635     /* Temporary solution until all routines take PaddedRVecVector */
1636     rvec *f = as_rvec_array(force->data());
1637
1638     /* Start the force cycle counter.
1639      * Note that a different counter is used for dynamic load balancing.
1640      */
1641     wallcycle_start(wcycle, ewcFORCE);
1642
1643     if (bDoForces)
1644     {
1645         /* Reset forces for which the virial is calculated separately:
1646          * PME/Ewald forces if necessary */
1647         if (fr->bF_NoVirSum)
1648         {
1649             if (flags & GMX_FORCE_VIRIAL)
1650             {
1651                 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1652                 /* TODO: remove this - 1 when padding is properly implemented */
1653                 clear_rvecs(fr->f_novirsum->size() - 1,
1654                             as_rvec_array(fr->f_novirsum->data()));
1655             }
1656             else
1657             {
1658                 /* We are not calculating the pressure so we do not need
1659                  * a separate array for forces that do not contribute
1660                  * to the pressure.
1661                  */
1662                 fr->f_novirsum = force;
1663             }
1664         }
1665
1666         /* Clear the short- and long-range forces */
1667         clear_rvecs(fr->natoms_force_constr, f);
1668
1669         clear_rvec(fr->vir_diag_posres);
1670     }
1671     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1672     {
1673         clear_pull_forces(inputrec->pull_work);
1674     }
1675
1676     /* update QMMMrec, if necessary */
1677     if (fr->bQMMM)
1678     {
1679         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1680     }
1681
1682     /* Compute the bonded and non-bonded energies and optionally forces */
1683     do_force_lowlevel(fr, inputrec, &(top->idef),
1684                       cr, nrnb, wcycle, mdatoms,
1685                       x, hist, f, enerd, fcd, top, fr->born,
1686                       bBornRadii, box,
1687                       inputrec->fepvals, lambda,
1688                       graph, &(top->excls), fr->mu_tot,
1689                       flags,
1690                       &cycles_pme);
1691
1692     wallcycle_stop(wcycle, ewcFORCE);
1693
1694     if (DOMAINDECOMP(cr))
1695     {
1696         dd_force_flop_stop(cr->dd, nrnb);
1697
1698         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1699         {
1700             ddCloseBalanceRegionCpu(cr->dd);
1701         }
1702     }
1703
1704     if (ed)
1705     {
1706         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1707     }
1708
1709     if (bDoForces)
1710     {
1711         /* Collect forces from modules */
1712         gmx::ArrayRef<gmx::RVec> fNoVirSum;
1713         if (fr->bF_NoVirSum)
1714         {
1715             fNoVirSum = *fr->f_novirsum;
1716         }
1717         fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1718
1719         /* Communicate the forces */
1720         if (DOMAINDECOMP(cr))
1721         {
1722             wallcycle_start(wcycle, ewcMOVEF);
1723             dd_move_f(cr->dd, f, fr->fshift);
1724             /* Do we need to communicate the separate force array
1725              * for terms that do not contribute to the single sum virial?
1726              * Position restraints and electric fields do not introduce
1727              * inter-cg forces, only full electrostatics methods do.
1728              * When we do not calculate the virial, fr->f_novirsum = f,
1729              * so we have already communicated these forces.
1730              */
1731             if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1732                 (flags & GMX_FORCE_VIRIAL))
1733             {
1734                 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1735             }
1736             wallcycle_stop(wcycle, ewcMOVEF);
1737         }
1738
1739         /* If we have NoVirSum forces, but we do not calculate the virial,
1740          * we sum fr->f_novirsum=f later.
1741          */
1742         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1743         {
1744             wallcycle_start(wcycle, ewcVSITESPREAD);
1745             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1746                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1747             wallcycle_stop(wcycle, ewcVSITESPREAD);
1748         }
1749
1750         if (flags & GMX_FORCE_VIRIAL)
1751         {
1752             /* Calculation of the virial must be done after vsites! */
1753             calc_virial(0, mdatoms->homenr, x, f,
1754                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1755         }
1756     }
1757
1758     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1759     {
1760         pull_potential_wrapper(cr, inputrec, box, x,
1761                                f, vir_force, mdatoms, enerd, lambda, t,
1762                                wcycle);
1763     }
1764
1765     /* Add the forces from enforced rotation potentials (if any) */
1766     if (inputrec->bRot)
1767     {
1768         wallcycle_start(wcycle, ewcROTadd);
1769         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1770         wallcycle_stop(wcycle, ewcROTadd);
1771     }
1772
1773     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1774     IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1775
1776     if (PAR(cr) && !(cr->duty & DUTY_PME))
1777     {
1778         /* In case of node-splitting, the PP nodes receive the long-range
1779          * forces, virial and energy from the PME nodes here.
1780          */
1781         pme_receive_force_ener(cr, wcycle, enerd, fr);
1782     }
1783
1784     if (bDoForces)
1785     {
1786         post_process_forces(cr, step, nrnb, wcycle,
1787                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1788                             flags);
1789     }
1790
1791     if (flags & GMX_FORCE_ENERGY)
1792     {
1793         /* Sum the potential energy terms from group contributions */
1794         sum_epot(&(enerd->grpp), enerd->term);
1795
1796         if (!EI_TPI(inputrec->eI))
1797         {
1798             checkPotentialEnergyValidity(enerd);
1799         }
1800     }
1801
1802 }
1803
1804 void do_force(FILE *fplog, t_commrec *cr,
1805               t_inputrec *inputrec,
1806               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1807               gmx_localtop_t *top,
1808               gmx_groups_t *groups,
1809               matrix box, PaddedRVecVector *coordinates, history_t *hist,
1810               PaddedRVecVector *force,
1811               tensor vir_force,
1812               t_mdatoms *mdatoms,
1813               gmx_enerdata_t *enerd, t_fcdata *fcd,
1814               gmx::ArrayRef<real> lambda, t_graph *graph,
1815               t_forcerec *fr,
1816               gmx_vsite_t *vsite, rvec mu_tot,
1817               double t, gmx_edsam_t ed,
1818               gmx_bool bBornRadii,
1819               int flags,
1820               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1821               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1822 {
1823     /* modify force flag if not doing nonbonded */
1824     if (!fr->bNonbonded)
1825     {
1826         flags &= ~GMX_FORCE_NONBONDED;
1827     }
1828
1829     GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1830                fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1831     GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1832                fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1833
1834     rvec *x = as_rvec_array(coordinates->data());
1835
1836     switch (inputrec->cutoff_scheme)
1837     {
1838         case ecutsVERLET:
1839             do_force_cutsVERLET(fplog, cr, inputrec,
1840                                 step, nrnb, wcycle,
1841                                 top,
1842                                 groups,
1843                                 box, x, hist,
1844                                 force, vir_force,
1845                                 mdatoms,
1846                                 enerd, fcd,
1847                                 lambda.data(), graph,
1848                                 fr, fr->ic,
1849                                 vsite, mu_tot,
1850                                 t, ed,
1851                                 bBornRadii,
1852                                 flags,
1853                                 ddOpenBalanceRegion,
1854                                 ddCloseBalanceRegion);
1855             break;
1856         case ecutsGROUP:
1857             do_force_cutsGROUP(fplog, cr, inputrec,
1858                                step, nrnb, wcycle,
1859                                top,
1860                                groups,
1861                                box, x, hist,
1862                                force, vir_force,
1863                                mdatoms,
1864                                enerd, fcd,
1865                                lambda.data(), graph,
1866                                fr, vsite, mu_tot,
1867                                t, ed,
1868                                bBornRadii,
1869                                flags,
1870                                ddOpenBalanceRegion,
1871                                ddCloseBalanceRegion);
1872             break;
1873         default:
1874             gmx_incons("Invalid cut-off scheme passed!");
1875     }
1876
1877     /* In case we don't have constraints and are using GPUs, the next balancing
1878      * region starts here.
1879      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1880      * virial calculation and COM pulling, is thus not included in
1881      * the balance timing, which is ok as most tasks do communication.
1882      */
1883     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1884     {
1885         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1886     }
1887 }
1888
1889
1890 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1891                         t_inputrec *ir, t_mdatoms *md,
1892                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1893                         t_forcerec *fr, gmx_localtop_t *top)
1894 {
1895     int             i, m, start, end;
1896     gmx_int64_t     step;
1897     real            dt = ir->delta_t;
1898     real            dvdl_dum;
1899     rvec           *savex;
1900
1901     /* We need to allocate one element extra, since we might use
1902      * (unaligned) 4-wide SIMD loads to access rvec entries.
1903      */
1904     snew(savex, state->natoms + 1);
1905
1906     start = 0;
1907     end   = md->homenr;
1908
1909     if (debug)
1910     {
1911         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1912                 start, md->homenr, end);
1913     }
1914     /* Do a first constrain to reset particles... */
1915     step = ir->init_step;
1916     if (fplog)
1917     {
1918         char buf[STEPSTRSIZE];
1919         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1920                 gmx_step_str(step, buf));
1921     }
1922     dvdl_dum = 0;
1923
1924     /* constrain the current position */
1925     constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1926               ir, cr, step, 0, 1.0, md,
1927               as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1928               fr->bMolPBC, state->box,
1929               state->lambda[efptBONDED], &dvdl_dum,
1930               nullptr, nullptr, nrnb, econqCoord);
1931     if (EI_VV(ir->eI))
1932     {
1933         /* constrain the initial velocity, and save it */
1934         /* may also be useful if we need the ekin from the half step for velocity Verlet */
1935         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1936                   ir, cr, step, 0, 1.0, md,
1937                   as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1938                   fr->bMolPBC, state->box,
1939                   state->lambda[efptBONDED], &dvdl_dum,
1940                   nullptr, nullptr, nrnb, econqVeloc);
1941     }
1942     /* constrain the initial velocities at t-dt/2 */
1943     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1944     {
1945         for (i = start; (i < end); i++)
1946         {
1947             for (m = 0; (m < DIM); m++)
1948             {
1949                 /* Reverse the velocity */
1950                 state->v[i][m] = -state->v[i][m];
1951                 /* Store the position at t-dt in savex (v is already reversed, so this is x(0) - dt*v(0)) */
1952                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1953             }
1954         }
1955         /* Shake the positions at t=-dt with the positions at t=0
1956          * as reference coordinates.
1957          */
1958         if (fplog)
1959         {
1960             char buf[STEPSTRSIZE];
1961             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1962                     gmx_step_str(step, buf));
1963         }
1964         dvdl_dum = 0;
1965         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1966                   ir, cr, step, -1, 1.0, md,
1967                   as_rvec_array(state->x.data()), savex, nullptr,
1968                   fr->bMolPBC, state->box,
1969                   state->lambda[efptBONDED], &dvdl_dum,
1970                   as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
1971
1972         for (i = start; i < end; i++)
1973         {
1974             for (m = 0; m < DIM; m++)
1975             {
1976                 /* Re-reverse the velocities */
1977                 state->v[i][m] = -state->v[i][m];
1978             }
1979         }
1980     }
1981     sfree(savex);
1982 }
1983
1984
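     /* Helper for the dispersion correction below: integrates the tabulated
      * 4*pi*r^2*V(r) (energy) and 4*pi*r^2 * r*dV/dr (virial) over spline
      * table bins [rstart,rend). offstart selects the dispersion (0) or
      * repulsion (4) columns of the packed vdwtab array; results go to
      * *enerout and *virout, with the 4*pi factors applied at the end.
      */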
1985 static void
1986 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
1987                 double *enerout, double *virout)
1988 {
1989     double enersum, virsum;
1990     double invscale, invscale2, invscale3;
1991     double r, ea, eb, ec, pa, pb, pc, pd;
1992     double y0, f, g, h;
1993     int    ri, offset;
1994     double tabfactor;
1995
1996     invscale  = 1.0/scale;
1997     invscale2 = invscale*invscale;
1998     invscale3 = invscale*invscale2;
1999
2000     /* Following summation derived from cubic spline definition,
2001      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2002      * the cubic spline.  We first calculate the negative of the
2003      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2004      * add the more standard, abrupt cutoff correction to that result,
2005      * yielding the long-range correction for a switched function.  We
2006      * perform both the pressure and energy loops at the same time for
2007      * simplicity, as the computational cost is low. */
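         /* For reference, the per-bin integral evaluated in the loop: with
          * bin start r = ri/scale and t in [0,1] across one bin, the spline
          * value is V(t) = y0 + f*t + g*t^2 + h*t^3, the volume element
          * r'^2 dr' (r' = r + t/scale) expands to (ea*t^2 + eb*t + ec) dt,
          * and r'^3 to pa*t^3 + pb*t^2 + pc*t + pd. Integrating V and dV/dt
          * against these over t = 0..1 gives exactly the enersum and virsum
          * terms accumulated per table point.
          */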
2008
2009     if (offstart == 0)
2010     {
2011         /* Since the dispersion table has been scaled down a factor
2012          * 6.0 and the repulsion a factor 12.0 to compensate for the
2013          * c6/c12 parameters inside nbfp[] being scaled up (to save
2014          * flops in kernels), we need to correct for this.
2015          */
2016         tabfactor = 6.0;
2017     }
2018     else
2019     {
2020         tabfactor = 12.0;
2021     }
2022
2023     enersum = 0.0;
2024     virsum  = 0.0;
2025     for (ri = rstart; ri < rend; ++ri)
2026     {
2027         r  = ri*invscale;
2028         ea = invscale3;
2029         eb = 2.0*invscale2*r;
2030         ec = invscale*r*r;
2031
2032         pa = invscale3;
2033         pb = 3.0*invscale2*r;
2034         pc = 3.0*invscale*r*r;
2035         pd = r*r*r;
2036
2037         /* this "8" is from the packing in the vdwtab array - perhaps
2038            should be defined? */
2039
2040         offset = 8*ri + offstart;
2041         y0     = vdwtab[offset];
2042         f      = vdwtab[offset+1];
2043         g      = vdwtab[offset+2];
2044         h      = vdwtab[offset+3];
2045
2046         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2047         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2048     }
2049     *enerout = 4.0*M_PI*enersum*tabfactor;
2050     *virout  = 4.0*M_PI*virsum*tabfactor;
2051 }
2052
2053 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2054 {
2055     double   eners[2], virs[2], enersum, virsum;
2056     double   r0, rc3, rc9;
2057     int      ri0, ri1, i;
2058     real     scale, *vdwtab;
2059
2060     fr->enershiftsix    = 0;
2061     fr->enershifttwelve = 0;
2062     fr->enerdiffsix     = 0;
2063     fr->enerdifftwelve  = 0;
2064     fr->virdiffsix      = 0;
2065     fr->virdifftwelve   = 0;
2066
2067     if (eDispCorr != edispcNO)
2068     {
2069         for (i = 0; i < 2; i++)
2070         {
2071             eners[i] = 0;
2072             virs[i]  = 0;
2073         }
2074         if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2075             (fr->vdw_modifier == eintmodPOTSWITCH) ||
2076             (fr->vdw_modifier == eintmodFORCESWITCH) ||
2077             (fr->vdwtype == evdwSHIFT) ||
2078             (fr->vdwtype == evdwSWITCH))
2079         {
2080             if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2081                  (fr->vdw_modifier == eintmodFORCESWITCH) ||
2082                  (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2083             {
2084                 gmx_fatal(FARGS,
2085                           "With dispersion correction rvdw-switch can not be zero "
2086                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2087             }
2088
2089             /* TODO This code depends on the logic in tables.c that
2090                constructs the table layout, which should be made
2091                explicit in future cleanup. */
2092             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2093                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2094             scale  = fr->dispersionCorrectionTable->scale;
2095             vdwtab = fr->dispersionCorrectionTable->data;
2096
2097             /* Round the cut-offs to exact table values for precision */
2098             ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
2099             ri1  = static_cast<int>(ceil(fr->rvdw*scale));
2100
2101             /* The code below has some support for handling force-switching, i.e.
2102              * when the force (instead of potential) is switched over a limited
2103              * region. This leads to a constant shift in the potential inside the
2104              * switching region, which we can handle by adding a constant energy
2105              * term in the force-switch case just like when we do potential-shift.
2106              *
2107              * For now this is not enabled, but to keep the functionality in the
2108              * code we check separately for switch and shift. When we do force-switch
2109              * the shifting point is rvdw_switch, while it is the cutoff when we
2110              * have a classical potential-shift.
2111              *
2112              * For a pure potential-shift the potential has a constant shift
2113              * all the way out to the cutoff, and that is it. For other forms
2114              * we need to calculate the constant shift up to the point where we
2115              * start modifying the potential.
2116              */
2117             ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2118
2119             r0   = ri0/scale;
2120             rc3  = r0*r0*r0;
2121             rc9  = rc3*rc3*rc3;
2122
2123             if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2124                 (fr->vdwtype == evdwSHIFT))
2125             {
2126                 /* Determine the constant energy shift below rvdw_switch.
2127                  * Table has a scale factor since we have scaled it down to compensate
2128                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2129                  */
2130                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2131                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2132             }
2133             else if (fr->vdw_modifier == eintmodPOTSHIFT)
2134             {
2135                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2136                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2137             }
2138
2139             /* Add the constant part from 0 to rvdw_switch.
2140              * This integration from 0 to rvdw_switch overcounts the number
2141              * of interactions by 1, as it also counts the self interaction.
2142              * We will correct for this later.
2143              */
2144             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2145             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2146
2147             /* Calculate the contribution in the range [r0,r1] where we
2148              * modify the potential. For a pure potential-shift modifier we will
2149              * have ri0==ri1, and there will not be any contribution here.
2150              */
2151             for (i = 0; i < 2; i++)
2152             {
2153                 enersum = 0;
2154                 virsum  = 0;
2155                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2156                 eners[i] -= enersum;
2157                 virs[i]  -= virsum;
2158             }
2159
2160             /* Alright: Above we compensated by REMOVING the parts outside r0
2161              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2162              *
2163              * Regardless of whether r0 is the point where we start switching,
2164              * or the cutoff where we calculated the constant shift, we include
2165              * all the parts we are missing out to infinity from r0 by
2166              * calculating the analytical dispersion correction.
2167              */
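                 /* For reference, the analytical tail integrals used in the
                  * four statements below (density and C6/C12 prefactors are
                  * applied later in calc_dispcorr):
                  *   int_r0^inf 4 pi r^2 * (-r^-6)     dr = -4 pi/(3 r0^3)
                  *   int_r0^inf 4 pi r^2 * ( r^-12)    dr =  4 pi/(9 r0^9)
                  * and, from the virial integrand r*dV/dr:
                  *   int_r0^inf 4 pi r^2 * (  6 r^-6)  dr =   8 pi/r0^3
                  *   int_r0^inf 4 pi r^2 * (-12 r^-12) dr = -16 pi/(3 r0^9)
                  */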
2168             eners[0] += -4.0*M_PI/(3.0*rc3);
2169             eners[1] +=  4.0*M_PI/(9.0*rc9);
2170             virs[0]  +=  8.0*M_PI/rc3;
2171             virs[1]  += -16.0*M_PI/(3.0*rc9);
2172         }
2173         else if (fr->vdwtype == evdwCUT ||
2174                  EVDW_PME(fr->vdwtype) ||
2175                  fr->vdwtype == evdwUSER)
2176         {
2177             if (fr->vdwtype == evdwUSER && fplog)
2178             {
2179                 fprintf(fplog,
2180                         "WARNING: using dispersion correction with user tables\n");
2181             }
2182
2183             /* Note that with LJ-PME, the dispersion correction is multiplied
2184              * by the difference between the actual C6 and the value of C6
2185              * that would produce the combination rule.
2186              * This means the normal energy and virial difference formulas
2187              * can be used here.
2188              */
2189
2190             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2191             rc9  = rc3*rc3*rc3;
2192             /* Contribution beyond the cut-off */
2193             eners[0] += -4.0*M_PI/(3.0*rc3);
2194             eners[1] +=  4.0*M_PI/(9.0*rc9);
2195             if (fr->vdw_modifier == eintmodPOTSHIFT)
2196             {
2197                 /* Contribution within the cut-off */
2198                 eners[0] += -4.0*M_PI/(3.0*rc3);
2199                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2200             }
2201             /* Contribution beyond the cut-off */
2202             virs[0]  +=  8.0*M_PI/rc3;
2203             virs[1]  += -16.0*M_PI/(3.0*rc9);
2204         }
2205         else
2206         {
2207             gmx_fatal(FARGS,
2208                       "Dispersion correction is not implemented for vdw-type = %s",
2209                       evdw_names[fr->vdwtype]);
2210         }
2211
2212         /* When we deprecate the group kernels the code below can go too */
2213         if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2214         {
2215             /* Calculate self-interaction coefficient (assuming that
2216              * the reciprocal-space contribution is constant in the
2217              * region that contributes to the self-interaction).
2218              */
2219             fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2220
2221             eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2222             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2223         }
2224
2225         fr->enerdiffsix    = eners[0];
2226         fr->enerdifftwelve = eners[1];
2227         /* The 0.5 is due to the Gromacs definition of the virial */
2228         fr->virdiffsix     = 0.5*virs[0];
2229         fr->virdifftwelve  = 0.5*virs[1];
2230     }
2231 }
2232
2233 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2234                    matrix box, real lambda, tensor pres, tensor virial,
2235                    real *prescorr, real *enercorr, real *dvdlcorr)
2236 {
2237     gmx_bool bCorrAll, bCorrPres;
2238     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2239     int      m;
2240
2241     *prescorr = 0;
2242     *enercorr = 0;
2243     *dvdlcorr = 0;
2244
2245     clear_mat(virial);
2246     clear_mat(pres);
2247
2248     if (ir->eDispCorr != edispcNO)
2249     {
2250         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2251                      ir->eDispCorr == edispcAllEnerPres);
2252         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2253                      ir->eDispCorr == edispcAllEnerPres);
2254
2255         invvol = 1/det(box);
2256         if (fr->n_tpi)
2257         {
2258             /* Only correct for the interactions with the inserted molecule */
2259             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2260             ninter = fr->n_tpi;
2261         }
2262         else
2263         {
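                 /* All atoms see the average density of the others; the 0.5
                  * in ninter avoids counting each pair twice.
                  */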
2264             dens   = fr->numAtomsForDispersionCorrection*invvol;
2265             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2266         }
2267
2268         if (ir->efep == efepNO)
2269         {
2270             avcsix    = fr->avcsix[0];
2271             avctwelve = fr->avctwelve[0];
2272         }
2273         else
2274         {
2275             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2276             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2277         }
2278
2279         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2280         *enercorr += avcsix*enerdiff;
2281         dvdlambda  = 0.0;
2282         if (ir->efep != efepNO)
2283         {
2284             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2285         }
2286         if (bCorrAll)
2287         {
2288             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2289             *enercorr += avctwelve*enerdiff;
2290             if (fr->efep != efepNO)
2291             {
2292                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2293             }
2294         }
2295
2296         if (bCorrPres)
2297         {
2298             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2299             if (ir->eDispCorr == edispcAllEnerPres)
2300             {
2301                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2302             }
2303             /* The factor 2 is because of the Gromacs virial definition */
2304             spres = -2.0*invvol*svir*PRESFAC;
2305
2306             for (m = 0; m < DIM; m++)
2307             {
2308                 virial[m][m] += svir;
2309                 pres[m][m]   += spres;
2310             }
2311             *prescorr += spres;
2312         }
2313
2314         /* Can't currently control when it prints; for now, just print when debugging */
2315         if (debug)
2316         {
2317             if (bCorrAll)
2318             {
2319                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2320                         avcsix, avctwelve);
2321             }
2322             if (bCorrPres)
2323             {
2324                 fprintf(debug,
2325                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2326                         *enercorr, spres, svir);
2327             }
2328             else
2329             {
2330                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2331             }
2332         }
2333
2334         if (fr->efep != efepNO)
2335         {
2336             *dvdlcorr += dvdlambda;
2337         }
2338     }
2339 }
2340
2341 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2342                             const gmx_mtop_t *mtop, rvec x[],
2343                             gmx_bool bFirst)
2344 {
2345     t_graph        *graph;
2346     int             mb, as, mol;
2347     gmx_molblock_t *molb;
2348
2349     if (bFirst && fplog)
2350     {
2351         fprintf(fplog, "Removing pbc first time\n");
2352     }
2353
2354     snew(graph, 1);
2355     as = 0;
2356     for (mb = 0; mb < mtop->nmolblock; mb++)
2357     {
2358         molb = &mtop->molblock[mb];
2359         if (molb->natoms_mol == 1 ||
2360             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2361         {
2362             /* Just one atom or charge group in the molecule, no PBC required */
2363             as += molb->nmol*molb->natoms_mol;
2364         }
2365         else
2366         {
2367             /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2368             mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2369                            0, molb->natoms_mol, FALSE, FALSE, graph);
2370
2371             for (mol = 0; mol < molb->nmol; mol++)
2372             {
2373                 mk_mshift(fplog, graph, ePBC, box, x+as);
2374
2375                 shift_self(graph, box, x+as);
2376                 /* The molecule is whole now.
2377                  * We don't need the second mk_mshift call as in do_pbc_first,
2378                  * since we no longer need this graph.
2379                  */
2380
2381                 as += molb->natoms_mol;
2382             }
2383             done_graph(graph);
2384         }
2385     }
2386     sfree(graph);
2387 }
2388
2389 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2390                        const gmx_mtop_t *mtop, rvec x[])
2391 {
2392     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2393 }
2394
2395 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2396                  gmx_mtop_t *mtop, rvec x[])
2397 {
2398     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2399 }
2400
2401 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2402 {
2403     int t, nth;
2404     nth = gmx_omp_nthreads_get(emntDefault);
2405
2406 #pragma omp parallel for num_threads(nth) schedule(static)
2407     for (t = 0; t < nth; t++)
2408     {
2409         try
2410         {
2411             int offset, len;
2412
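                 /* Static decomposition: thread t handles the contiguous
                  * atom slice [natoms*t/nth, natoms*(t+1)/nth).
                  */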
2413             offset = (natoms*t    )/nth;
2414             len    = (natoms*(t + 1))/nth - offset;
2415             put_atoms_in_box(ePBC, box, len, x + offset);
2416         }
2417         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2418     }
2419 }
2420
2421 // TODO This can be cleaned up a lot, and move back to runner.cpp
2422 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2423                 const t_inputrec *inputrec,
2424                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2425                 gmx_walltime_accounting_t walltime_accounting,
2426                 nonbonded_verlet_t *nbv,
2427                 gmx_bool bWriteStat)
2428 {
2429     t_nrnb *nrnb_tot = nullptr;
2430     double  delta_t  = 0;
2431     double  nbfs     = 0, mflop = 0;
2432     double  elapsed_time,
2433             elapsed_time_over_all_ranks,
2434             elapsed_time_over_all_threads,
2435             elapsed_time_over_all_threads_over_all_ranks;
2436     /* Control whether it is valid to print a report. Only the
2437        simulation master may print, but it should not do so if the run
2438        terminated e.g. before a scheduled reset step. This is
2439        complicated by the fact that PME ranks are unaware of the
2440        reason why they were sent a pmerecvqxFINISH. To avoid
2441        communication deadlocks, we always do the communication for the
2442        report, even if we've decided not to write the report, because
2443        how long it takes to finish the run is not important when we've
2444        decided not to report on the simulation performance. */
2445     bool    printReport = SIMMASTER(cr);
2446
2447     if (!walltime_accounting_get_valid_finish(walltime_accounting))
2448     {
2449         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2450         printReport = false;
2451     }
2452
2453     if (cr->nnodes > 1)
2454     {
2455         snew(nrnb_tot, 1);
2456 #if GMX_MPI
2457         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2458                       cr->mpi_comm_mysim);
2459 #endif
2460     }
2461     else
2462     {
2463         nrnb_tot = nrnb;
2464     }
2465
2466     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2467     elapsed_time_over_all_ranks                  = elapsed_time;
2468     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2469     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2470 #if GMX_MPI
2471     if (cr->nnodes > 1)
2472     {
2473         /* reduce elapsed_time over all MPI ranks in the current simulation */
2474         MPI_Allreduce(&elapsed_time,
2475                       &elapsed_time_over_all_ranks,
2476                       1, MPI_DOUBLE, MPI_SUM,
2477                       cr->mpi_comm_mysim);
2478         elapsed_time_over_all_ranks /= cr->nnodes;
2479         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2480          * current simulation. */
2481         MPI_Allreduce(&elapsed_time_over_all_threads,
2482                       &elapsed_time_over_all_threads_over_all_ranks,
2483                       1, MPI_DOUBLE, MPI_SUM,
2484                       cr->mpi_comm_mysim);
2485     }
2486 #endif
2487
2488     if (printReport)
2489     {
2490         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2491     }
2492     if (cr->nnodes > 1)
2493     {
2494         sfree(nrnb_tot);
2495     }
2496
2497     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2498     {
2499         print_dd_statistics(cr, inputrec, fplog);
2500     }
2501
2502     /* TODO Move the responsibility for any scaling by thread counts
2503      * to the code that handled the thread region, so that there's a
2504      * mechanism to keep cycle counting working during the transition
2505      * to task parallelism. */
2506     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2507     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2508     wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2509     auto cycle_sum(wallcycle_sum(cr, wcycle));
2510
2511     if (printReport)
2512     {
2513         struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2514
2515         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2516                         elapsed_time_over_all_ranks,
2517                         wcycle, cycle_sum, gputimes);
2518
2519         if (EI_DYNAMICS(inputrec->eI))
2520         {
2521             delta_t = inputrec->delta_t;
2522         }
2523
2524         if (fplog)
2525         {
2526             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2527                        elapsed_time_over_all_ranks,
2528                        walltime_accounting_get_nsteps_done(walltime_accounting),
2529                        delta_t, nbfs, mflop);
2530         }
2531         if (bWriteStat)
2532         {
2533             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2534                        elapsed_time_over_all_ranks,
2535                        walltime_accounting_get_nsteps_done(walltime_accounting),
2536                        delta_t, nbfs, mflop);
2537         }
2538     }
2539 }
2540
2541 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2542 {
2543     /* this function works, but could probably use a logic rewrite to keep all the different
2544        types of efep straight. */
2545
2546     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2547     {
2548         return;
2549     }
2550
2551     t_lambda *fep = ir->fepvals;
2552     *fep_state    = fep->init_fep_state; /* this might overwrite the value read
2553                                             from a checkpoint -- for now a kludge
2554                                             is in place to prevent that. */
2555
2556     for (int i = 0; i < efptNR; i++)
2557     {
2558         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2559         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2560         {
2561             lambda[i] = fep->init_lambda;
2562             if (lam0)
2563             {
2564                 lam0[i] = lambda[i];
2565             }
2566         }
2567         else
2568         {
2569             lambda[i] = fep->all_lambda[i][*fep_state];
2570             if (lam0)
2571             {
2572                 lam0[i] = lambda[i];
2573             }
2574         }
2575     }
2576     if (ir->bSimTemp)
2577     {
2578         /* need to rescale control temperatures to match current state */
2579         for (int i = 0; i < ir->opts.ngtc; i++)
2580         {
2581             if (ir->opts.ref_t[i] > 0)
2582             {
2583                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2584             }
2585         }
2586     }
2587
2588     /* Write the initial lambda values to the log */
2589     if (fplog != nullptr)
2590     {
2591         fprintf(fplog, "Initial vector of lambda components:[ ");
2592         for (const auto &l : lambda)
2593         {
2594             fprintf(fplog, "%10.4f ", l);
2595         }
2596         fprintf(fplog, "]\n");
2597     }
2598     return;
2599 }
2600
2601
2602 void init_md(FILE *fplog,
2603              t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2604              t_inputrec *ir, const gmx_output_env_t *oenv,
2605              double *t, double *t0,
2606              gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2607              t_nrnb *nrnb, gmx_mtop_t *mtop,
2608              gmx_update_t **upd,
2609              int nfile, const t_filenm fnm[],
2610              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2611              tensor force_vir, tensor shake_vir, rvec mu_tot,
2612              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2613              gmx_wallcycle_t wcycle)
2614 {
2615     int  i;
2616
2617     /* Initial values */
2618     *t = *t0       = ir->init_t;
2619
2620     *bSimAnn = FALSE;
2621     for (i = 0; i < ir->opts.ngtc; i++)
2622     {
2623         /* set bSimAnn if any group is being annealed */
2624         if (ir->opts.annealing[i] != eannNO)
2625         {
2626             *bSimAnn = TRUE;
2627         }
2628     }
2629
2630     /* Initialize lambda variables */
2631     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2632
2633     // TODO upd is never NULL in practice, but the analysers don't know that
2634     if (upd)
2635     {
2636         *upd = init_update(ir);
2637     }
2638     if (*bSimAnn)
2639     {
2640         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2641     }
2642
2643     if (vcm != nullptr)
2644     {
2645         *vcm = init_vcm(fplog, &mtop->groups, ir);
2646     }
2647
2648     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2649     {
2650         if (ir->etc == etcBERENDSEN)
2651         {
2652             please_cite(fplog, "Berendsen84a");
2653         }
2654         if (ir->etc == etcVRESCALE)
2655         {
2656             please_cite(fplog, "Bussi2007a");
2657         }
2658         if (ir->eI == eiSD1)
2659         {
2660             please_cite(fplog, "Goga2012");
2661         }
2662     }
2663     init_nrnb(nrnb);
2664
2665     if (nfile != -1)
2666     {
2667         *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2668
2669         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2670                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2671     }
2672
2673     /* Initialize variables */
2674     clear_mat(force_vir);
2675     clear_mat(shake_vir);
2676     clear_rvec(mu_tot);
2677 }