/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "sim_util.h"

#include "config.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <array>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/md_logging.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_int64_t               step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#if !GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    fflush(out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
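                /* gmx_ctime_r produces a ctime-style string that ends in
                 * a newline; strip that trailing character.
                 */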
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
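            /* nsteps < 0 means there is no end step, so report the rate
             * instead: delta_t/1000 is ns per step, 24*60*60 is s per day,
             * and time_per_step is s per step, giving ns/day.
             */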
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#if !GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}

void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    char   time_string[STRLEN];

    if (!fplog)
    {
        return;
    }

    {
        int    i;
        char   timebuf[STRLEN];
        time_t temp_time = (time_t) the_time;

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';
    }

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}

void print_start(FILE *fplog, t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}

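/* Adds the long-range force buffer flr into the force array f for
 * atoms start..end, parallelized over the default OpenMP threads.
 */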
static void sum_forces(int start, int end, rvec f[], rvec flr[])
{
    int i;

    if (gmx_debug_at)
    {
        pr_rvecs(debug, 0, "fsr", f+start, end-start);
        pr_rvecs(debug, 0, "flr", flr+start, end-start);
    }
    // cppcheck-suppress unreadVariable
    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (i = start; i < end; i++)
    {
        rvec_inc(f[i], flr[i]);
    }
}

/*
 * calc_f_el calculates forces due to an electric field.
 *
 * The force is in kJ mol^-1 nm^-1, i.e. the charge (in e) times the
 * field strength (in kJ mol^-1 nm^-1 e^-1).
 *
 * Et[] contains the parameters for the time-dependent
 * part of the field.
 * Ex[] contains the parameters for
 * the spatially dependent part of the field.
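 * With three time parameters, Et[m].a = {omega, t0, sigma}, the code
 * below applies a Gaussian-enveloped pulse
 *   E(t) = cos(omega*(t - t0)) * exp(-(t - t0)^2/(2*sigma^2)),
 * and otherwise a plain oscillation E(t) = cos(omega*t); the amplitude
 * Ex[m].a[0] is given in V/nm and converted with FIELDFAC.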
 * The function should return the energy due to the electric field
 * (if any), but for now it computes only the forces.
 *
 * WARNING:
 * There can be problems with the virial.
 * Since the field is not self-consistent this is unavoidable.
 * For neutral molecules the virial is correct within this approximation.
 * For neutral systems with many charged molecules the error is small.
 * But for systems with a net charge or a few charged molecules
 * the error can be significant when the field is high.
 * Solution: implement a self-consistent electric field into PME.
 */
static void calc_f_el(FILE *fp, int  start, int homenr,
                      real charge[], rvec f[],
                      t_cosines Ex[], t_cosines Et[], double t)
{
    rvec Ext;
    real t0;
    int  i, m;

    for (m = 0; (m < DIM); m++)
    {
        if (Et[m].n > 0)
        {
            if (Et[m].n == 3)
            {
                t0     = Et[m].a[1];
                Ext[m] = std::cos(Et[m].a[0]*(t-t0))*std::exp(-gmx::square(t-t0)/(2.0*gmx::square(Et[m].a[2])));
            }
            else
            {
                Ext[m] = std::cos(Et[m].a[0]*t);
            }
        }
        else
        {
            Ext[m] = 1.0;
        }
        if (Ex[m].n > 0)
        {
            /* Convert the field strength from V/nm to MD-units */
            Ext[m] *= Ex[m].a[0]*FIELDFAC;
            for (i = start; (i < start+homenr); i++)
            {
                f[i][m] += charge[i]*Ext[m];
            }
        }
        else
        {
            Ext[m] = 0;
        }
    }
    if (fp != NULL)
    {
        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
    }
}

static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i;

    /* The short-range virial from surrounding boxes */
    clear_mat(vir_part);
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

static void pull_potential_wrapper(t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces; this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}

static void pme_receive_force_ener(t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e_q, e_lj, dvdl_q, dvdl_lj;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

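/* Prints all home atoms whose force norm is at least pforce (or NaN),
 * together with their coordinates.
 */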
static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_int64_t step, real pforce, rvec *x, rvec *f)
{
    int  i;
    real pf2, fn2;
    char buf[STEPSTRSIZE];

    pf2 = gmx::square(pforce);
    for (i = 0; i < md->homenr; i++)
    {
        fn2 = norm2(f[i]);
        /* We also catch NaN, if the compiler does not optimize this away. */
        if (fn2 >= pf2 || fn2 != fn2)
        {
            fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    gmx_step_str(step, buf),
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(fn2));
        }
    }
}

static void post_process_forces(t_commrec *cr,
                                gmx_int64_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles.
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            if (fr->bDomDec)
            {
                sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
            }
            else
            {
                sum_forces(0, mdatoms->homenr,
                           f, fr->f_novirsum);
            }
            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (EVDW_PME(fr->vdwtype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_lj_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    nonbonded_verlet_group_t  *nbvg;
    gmx_bool                   bUsingGpuKernels;

    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nbvg = &fr->nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }
    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
            nbnxn_kernel_ref(&nbvg->nbl_lists,
                             nbvg->nbat, ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk4xN_SIMD_4xN:
            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
                                  nbvg->nbat, ic,
                                  nbvg->ewald_excl,
                                  fr->shift_vec,
                                  flags,
                                  clearF,
                                  fr->fshift[0],
                                  enerd->grpp.ener[egCOULSR],
                                  fr->bBHAM ?
                                  enerd->grpp.ener[egBHAMSR] :
                                  enerd->grpp.ener[egLJSR]);
            break;
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
                                   nbvg->nbat, ic,
                                   nbvg->ewald_excl,
                                   fr->shift_vec,
                                   flags,
                                   clearF,
                                   fr->fshift[0],
                                   enerd->grpp.ener[egCOULSR],
                                   fr->bBHAM ?
                                   enerd->grpp.ener[egBHAMSR] :
                                   enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbvg->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbvg->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            gmx_incons("Invalid nonbonded kernel type passed!");

    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}

static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             t_mdatoms            *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
    }

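    /* The perturbed pair lists are distributed as one list per
     * non-bonded OpenMP thread; the assert below checks that this holds.
     */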
    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energy contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

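        /* Lambda index 0 reproduces the current state; indices i >= 1
         * evaluate the foreign lambdas fepvals->all_lambda[...][i-1].
         */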
        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}

gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != NULL && nbv->bUseGPU;
}

static gmx_inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}

/*! \brief  This routine checks if the potential energy is finite.
 *
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] enerd  The energy data; the non-bonded group energies need to be added in here before calling this routine
 */
static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
{
    if (!std::isfinite(enerd->term[F_EPOT]))
    {
        gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping atoms in bonded interactions or by very large or NaN coordinate values. This usually indicates a poorly equilibrated or non-equilibrated initial configuration, or incorrect interactions or parameters in the topology.",
                  enerd->term[F_EPOT],
                  enerd->term[F_LJ],
                  enerd->term[F_COUL_SR]);
    }
}

void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                         t_inputrec *inputrec,
                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                         gmx_localtop_t *top,
                         gmx_groups_t gmx_unused *groups,
                         matrix box, rvec x[], history_t *hist,
                         rvec f[],
                         tensor vir_force,
                         t_mdatoms *mdatoms,
                         gmx_enerdata_t *enerd, t_fcdata *fcd,
                         real *lambda, t_graph *graph,
                         t_forcerec *fr, interaction_const_t *ic,
                         gmx_vsite_t *vsite, rvec mu_tot,
                         double t, FILE *field, gmx_edsam_t ed,
                         gmx_bool bBornRadii,
                         int flags)
{
    int                 cg1, i, j;
    int                 start, homenr;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    gmx_bool            bDiffKernels = FALSE;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_force, cycles_wait_gpu;
    /* TODO To avoid loss of precision, float can't be used for a
     * cycle count. Build an object that can do this right and perhaps
     * also be used by gmx_wallcycle_t */
    gmx_cycles_t        cycleCountBeforeLocalWorkCompletes = 0;
    nonbonded_verlet_t *nbv;

    cycles_force    = 0;
    cycles_wait_gpu = 0;
    nbv             = fr->nbv;

    start  = 0;
    homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
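    /* With test particle insertion (TPI) the last charge group holds
     * the molecule being inserted, which is treated separately.
     */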
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->grp[0].nbat);

#if GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        gmx_bool bBS;
        matrix   boxs;

        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */

        int pme_flags = 0;

        wallcycle_start(wcycle, ewcPP_PMESENDX);

        bBS = (inputrec->nwall == 2);
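        /* With two walls the Ewald sums are done with the box elongated
         * in z by wall_ewald_zfac, to reduce interactions between
         * periodic images; bBS signals that the scaled box must be sent.
         */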
        if (bBS)
        {
            copy_mat(box, boxs);
            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
        }

        if (EEL_PME(fr->eeltype))
        {
            pme_flags |= GMX_PME_DO_COULOMB;
        }

        if (EVDW_PME(fr->vdwtype))
        {
            pme_flags |= GMX_PME_DO_LJ;
        }

        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
                                 pme_flags, step);

        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, NULL,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->grp[eintLocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

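        /* If the local and non-local atom data structures are shared,
         * the atom parameters can be set for all atoms in one call.
         */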
        if (nbv->ngrp == 1 ||
            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        else
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
                               nbv->nbs, mdatoms, fr->cginfo);
            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                            &top->excls,
                            ic->rlist,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->grp[eintLocal].nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
        /* launch local nonbonded F on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     nrnb, wcycle);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
                        nbv->grp[eintLocal].kernel_type);

        if (bDiffKernels)
        {
            /* With GPU+CPU non-bonded calculations we need to copy
             * the local coordinates to the non-local nbat struct
             * (in CPU format) as the non-local kernel call also
             * calculates the local - non-local interactions.
             */
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (bDiffKernels)
            {
                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
            }

            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                &top->excls,
                                ic->rlist,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);

            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU && !bDiffKernels)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
            /* launch non-local nonbonded F on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         nrnb, wcycle);
            cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        if (DOMAINDECOMP(cr) && !bDiffKernels)
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
                                 flags, eatLocal);
        cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        /* Enforced rotation has its own cycle counter that starts after the collective
         * coordinates have been communicated. It is added to ddCyclF to allow
         * for proper load-balancing */
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Start the force cycle counter.
     * This counter is stopped after do_force_lowlevel.
     * No parallel communication should occur while this counter is running,
     * since that will interfere with the dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    if (bDoForces)
    {
        /* Reset forces for which the virial is calculated separately:
         * PME/Ewald forces if necessary */
        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                fr->f_novirsum = fr->f_novirsum_alloc;
                if (fr->bDomDec)
                {
                    clear_rvecs_omp(fr->f_novirsum_n, fr->f_novirsum);
                }
                else
                {
                    clear_rvecs_omp(homenr, fr->f_novirsum+start);
                }
            }
            else
            {
                /* We are not calculating the pressure so we do not need
                 * a separate array for forces that do not contribute
                 * to the pressure.
                 */
                fr->f_novirsum = f;
            }
        }

        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* We calculate the non-bonded forces here, when they are computed
     * on the CPU. We do this before calling do_force_lowlevel, because
     * in that function the listed forces are calculated before PME, which
     * does communication. With this order, imbalance between the non-bonded
     * and listed force calculations can be balanced out by the domain
     * decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        /* Maybe we should move this into do_force_lowlevel */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, x, f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU || bDiffKernels)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
                         nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded forces to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
1288         cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1289         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1290         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1291         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1292         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1293         cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1294         wallcycle_start_nocount(wcycle, ewcFORCE);
1295
1296         /* if there are multiple fshift output buffers reduce them */
1297         if ((flags & GMX_FORCE_VIRIAL) &&
1298             nbv->grp[aloc].nbl_lists.nnbl > 1)
1299         {
1300             /* This is not in a subcounter because it takes a
1301                negligible and constant-sized amount of time */
1302             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1303                                                      fr->fshift);
1304         }
1305     }
1306
1307     /* update QMMMrec, if necessary */
1308     if (fr->bQMMM)
1309     {
1310         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1311     }
1312
1313     /* Compute the bonded and non-bonded energies and optionally forces */
1314     do_force_lowlevel(fr, inputrec, &(top->idef),
1315                       cr, nrnb, wcycle, mdatoms,
1316                       x, hist, f, enerd, fcd, top, fr->born,
1317                       bBornRadii, box,
1318                       inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1319                       flags, &cycles_pme);
1320
1321     cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1322
1323     if (ed)
1324     {
1325         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1326     }
1327
1328     if (bUseOrEmulGPU && !bDiffKernels)
1329     {
1330         /* wait for non-local forces (or calculate in emulation mode) */
1331         if (DOMAINDECOMP(cr))
1332         {
1333             if (bUseGPU)
1334             {
1335                 float cycles_tmp;
1336
1337                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1338                 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1339                                        flags, eatNonlocal,
1340                                        enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1341                                        fr->fshift);
1342                 cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1343                 cycles_wait_gpu += cycles_tmp;
1344                 cycles_force    += cycles_tmp;
1345             }
1346             else
1347             {
1348                 wallcycle_start_nocount(wcycle, ewcFORCE);
1349                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1350                              nrnb, wcycle);
1351                 cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1352             }
1353             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1354             wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1355             /* skip the reduction if there was no non-local work to do */
1356             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1357             {
1358                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1359                                                nbv->grp[eintNonlocal].nbat, f);
1360             }
1361             wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1362             cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1363         }
1364     }
1365
1366     if (bDoForces && DOMAINDECOMP(cr))
1367     {
1368         if (bUseGPU)
1369         {
1370             /* We are done with the CPU compute, but the GPU local non-bonded
1371              * kernel can still be running while we communicate the forces.
1372              * We start a counter here, so we can, hopefully, time the rest
1373              * of the GPU kernel execution and data transfer.
1374              */
1375             cycleCountBeforeLocalWorkCompletes = gmx_cycles_read();
1376         }
1377
1378         /* Communicate the forces */
1379         wallcycle_start(wcycle, ewcMOVEF);
1380         dd_move_f(cr->dd, f, fr->fshift);
1381         wallcycle_stop(wcycle, ewcMOVEF);
1382     }
1383
1384     if (bUseOrEmulGPU)
1385     {
1386         /* wait for local forces (or calculate in emulation mode) */
1387         if (bUseGPU)
1388         {
1389             float       cycles_tmp, cycles_wait_est;
1390             /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1391              * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1392              * but even with a step of 0.1 ms the difference is less than 1%
1393              * of the step time.
1394              */
1395             const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1396
1397             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1398             nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1399                                    flags, eatLocal,
1400                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1401                                    fr->fshift);
1402             cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1403
1404             if (bDoForces && DOMAINDECOMP(cr))
1405             {
1406                 cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
1407
1408                 if (cycles_tmp < gpuWaitApiOverheadMargin)
1409                 {
1410                     /* We measured few cycles, it could be that the kernel
1411                      * and transfer finished earlier and there was no actual
1412                      * wait time, only API call overhead.
1413                      * Then the actual time could be anywhere between 0 and
1414                      * cycles_wait_est. As a compromise, we use half the time.
1415                      */
1416                     cycles_wait_est *= 0.5f;
1417                 }
1418             }
1419             else
1420             {
1421                 /* No force communication so we actually timed the wait */
1422                 cycles_wait_est = cycles_tmp;
1423             }
1424             /* Even though this is after dd_move_f, the actual task we are
1425              * waiting for runs asynchronously with dd_move_f and we usually
1426              * have nothing to balance it with, so we can and should add
1427              * the time to the force time for load balancing.
1428              */
1429             cycles_force    += cycles_wait_est;
1430             cycles_wait_gpu += cycles_wait_est;
1431
1432             /* now clear the GPU outputs while we finish the step on the CPU */
1433             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1434             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1435             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1436         }
1437         else
1438         {
1439             wallcycle_start_nocount(wcycle, ewcFORCE);
1440             do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1441                          DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1442                          nrnb, wcycle);
1443             wallcycle_stop(wcycle, ewcFORCE);
1444         }
1445         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1446         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1447         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1448                                        nbv->grp[eintLocal].nbat, f);
1449         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1450         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1451     }
1452
1453     if (DOMAINDECOMP(cr))
1454     {
1455         dd_force_flop_stop(cr->dd, nrnb);
1456         if (wcycle)
1457         {
1458             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1459             if (bUseGPU)
1460             {
1461                 dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
1462             }
1463         }
1464     }
1465
1466     if (bDoForces)
1467     {
1468         if (inputrecElecField(inputrec))
1469         {
1470             /* Compute forces due to electric field */
1471             calc_f_el(MASTER(cr) ? field : NULL,
1472                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1473                       inputrec->ex, inputrec->et, t);
1474         }
1475
1476         /* If we have NoVirSum forces but do not calculate the virial,
1477          * fr->f_novirsum == f, so the summation is done later.
1478          */
1479         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1480         {
1481             wallcycle_start(wcycle, ewcVSITESPREAD);
1482             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1483                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1484             wallcycle_stop(wcycle, ewcVSITESPREAD);
1485         }
1486
1487         if (flags & GMX_FORCE_VIRIAL)
1488         {
1489             /* Calculation of the virial must be done after vsites! */
1490             calc_virial(0, mdatoms->homenr, x, f,
1491                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1492         }
1493     }
1494
1495     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1496     {
1497         /* Since the COM pulling is always done mass-weighted, no forces are
1498          * applied to vsites and this call can be done after vsite spreading.
1499          */
1500         pull_potential_wrapper(cr, inputrec, box, x,
1501                                f, vir_force, mdatoms, enerd, lambda, t,
1502                                wcycle);
1503     }
1504
1505     /* Add the forces from enforced rotation potentials (if any) */
1506     if (inputrec->bRot)
1507     {
1508         wallcycle_start(wcycle, ewcROTadd);
1509         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1510         wallcycle_stop(wcycle, ewcROTadd);
1511     }
1512
1513     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1514     IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1515
1516     if (PAR(cr) && !(cr->duty & DUTY_PME))
1517     {
1518         /* In case of node-splitting, the PP nodes receive the long-range
1519          * forces, virial and energy from the PME nodes here.
1520          */
1521         pme_receive_force_ener(cr, wcycle, enerd, fr);
1522     }
1523
1524     if (bDoForces)
1525     {
1526         post_process_forces(cr, step, nrnb, wcycle,
1527                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1528                             flags);
1529     }
1530
1531     if (flags & GMX_FORCE_ENERGY)
1532     {
1533         /* Sum the potential energy terms from group contributions */
1534         sum_epot(&(enerd->grpp), enerd->term);
1535
1536         if (!EI_TPI(inputrec->eI))
1537         {
1538             checkPotentialEnergyValidity(enerd);
1539         }
1540     }
1541 }
1542
1543 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1544                         t_inputrec *inputrec,
1545                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1546                         gmx_localtop_t *top,
1547                         gmx_groups_t *groups,
1548                         matrix box, rvec x[], history_t *hist,
1549                         rvec f[],
1550                         tensor vir_force,
1551                         t_mdatoms *mdatoms,
1552                         gmx_enerdata_t *enerd, t_fcdata *fcd,
1553                         real *lambda, t_graph *graph,
1554                         t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1555                         double t, FILE *field, gmx_edsam_t ed,
1556                         gmx_bool bBornRadii,
1557                         int flags)
1558 {
1559     int        cg0, cg1, i, j;
1560     int        start, homenr;
1561     double     mu[2*DIM];
1562     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1563     gmx_bool   bDoForces;
1564     float      cycles_pme, cycles_force;
1565
1566     start  = 0;
1567     homenr = mdatoms->homenr;
1568
1569     clear_mat(vir_force);
1570
1571     cg0 = 0;
1572     if (DOMAINDECOMP(cr))
1573     {
1574         cg1 = cr->dd->ncg_tot;
1575     }
1576     else
1577     {
1578         cg1 = top->cgs.nr;
1579     }
1580     if (fr->n_tpi > 0)
1581     {
1582         cg1--;
1583     }
1584
1585     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1586     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1587     /* Fill the neighborsearching grid only when searching on a changed state */
1588     bFillGrid      = (bNS && bStateChanged);
1589     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1590     bDoForces      = (flags & GMX_FORCE_FORCES);
1591
1592     if (bStateChanged)
1593     {
1594         update_forcerec(fr, box);
1595
1596         if (inputrecNeedMutot(inputrec))
1597         {
1598             /* Calculate the total (local) dipole moment in a temporary common array.
1599              * This makes it possible to sum the A- and B-state dipoles over the nodes faster.
1600              */
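                 /* Layout note: mu[0..DIM-1] receives the state-A dipole and
                  * mu[DIM..2*DIM-1] the state-B dipole, so the single
                  * gmx_sumd(2*DIM, mu, cr) call below reduces both at once. */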
1601             calc_mu(start, homenr,
1602                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1603                     mu, mu+DIM);
1604         }
1605     }
1606
1607     if (fr->ePBC != epbcNONE)
1608     {
1609         /* Compute shift vectors every step when the box can change,
1610          * e.g. because of pressure coupling or box deformation.
1611          */
1612         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1613         {
1614             calc_shifts(box, fr->shift_vec);
1615         }
1616
1617         if (bCalcCGCM)
1618         {
1619             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1620                                      &(top->cgs), x, fr->cg_cm);
1621             inc_nrnb(nrnb, eNR_CGCM, homenr);
1622             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1623         }
1624         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1625         {
1626             unshift_self(graph, box, x);
1627         }
1628     }
1629     else if (bCalcCGCM)
1630     {
1631         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1632         inc_nrnb(nrnb, eNR_CGCM, homenr);
1633     }
1634
1635     if (bCalcCGCM && gmx_debug_at)
1636     {
1637         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1638     }
1639
1640 #if GMX_MPI
1641     if (!(cr->duty & DUTY_PME))
1642     {
1643         gmx_bool bBS;
1644         matrix   boxs;
1645
1646         /* Send particle coordinates to the pme nodes.
1647          * Since this is only implemented for domain decomposition
1648          * and domain decomposition does not use the graph,
1649          * we do not need to worry about shifting.
1650          */
1651
1652         int pme_flags = 0;
1653
1654         wallcycle_start(wcycle, ewcPP_PMESENDX);
1655
1656         bBS = (inputrec->nwall == 2);
1657         if (bBS)
1658         {
1659             copy_mat(box, boxs);
1660             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1661         }
1662
1663         if (EEL_PME(fr->eeltype))
1664         {
1665             pme_flags |= GMX_PME_DO_COULOMB;
1666         }
1667
1668         if (EVDW_PME(fr->vdwtype))
1669         {
1670             pme_flags |= GMX_PME_DO_LJ;
1671         }
1672
1673         gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1674                                  mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
1675                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1676                                  pme_flags, step);
1677
1678         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1679     }
1680 #endif /* GMX_MPI */
1681
1682     /* Communicate coordinates and sum dipole if necessary */
1683     if (DOMAINDECOMP(cr))
1684     {
1685         wallcycle_start(wcycle, ewcMOVEX);
1686         dd_move_x(cr->dd, box, x);
1687         wallcycle_stop(wcycle, ewcMOVEX);
1688     }
1689
1690     if (inputrecNeedMutot(inputrec))
1691     {
1692         if (bStateChanged)
1693         {
1694             if (PAR(cr))
1695             {
1696                 gmx_sumd(2*DIM, mu, cr);
1697             }
1698             for (i = 0; i < 2; i++)
1699             {
1700                 for (j = 0; j < DIM; j++)
1701                 {
1702                     fr->mu_tot[i][j] = mu[i*DIM + j];
1703                 }
1704             }
1705         }
1706         if (fr->efep == efepNO)
1707         {
1708             copy_rvec(fr->mu_tot[0], mu_tot);
1709         }
1710         else
1711         {
1712             for (j = 0; j < DIM; j++)
1713             {
1714                 mu_tot[j] =
1715                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1716             }
1717         }
1718     }
1719
1720     /* Reset energies */
1721     reset_enerdata(enerd);
1722     clear_rvecs(SHIFTS, fr->fshift);
1723
1724     if (bNS)
1725     {
1726         wallcycle_start(wcycle, ewcNS);
1727
1728         if (graph && bStateChanged)
1729         {
1730             /* Calculate intramolecular shift vectors to make molecules whole */
1731             mk_mshift(fplog, graph, fr->ePBC, box, x);
1732         }
1733
1734         /* Do the actual neighbour searching */
1735         ns(fplog, fr, box,
1736            groups, top, mdatoms,
1737            cr, nrnb, bFillGrid);
1738
1739         wallcycle_stop(wcycle, ewcNS);
1740     }
1741
1742     if (inputrec->implicit_solvent && bNS)
1743     {
1744         make_gb_nblist(cr, inputrec->gb_algorithm,
1745                        x, box, fr, &top->idef, graph, fr->born);
1746     }
1747
1748     if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1749     {
1750         wallcycle_start(wcycle, ewcPPDURINGPME);
1751         dd_force_flop_start(cr->dd, nrnb);
1752     }
1753
1754     if (inputrec->bRot)
1755     {
1756         /* Enforced rotation has its own cycle counter that starts after the collective
1757          * coordinates have been communicated. It is added to ddCyclF to allow
1758          * for proper load-balancing */
1759         wallcycle_start(wcycle, ewcROT);
1760         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1761         wallcycle_stop(wcycle, ewcROT);
1762     }
1763
1764     /* Start the force cycle counter.
1765      * This counter is stopped after do_force_lowlevel.
1766      * No parallel communication should occur while this counter is running,
1767      * since that will interfere with the dynamic load balancing.
1768      */
1769     wallcycle_start(wcycle, ewcFORCE);
1770
1771     if (bDoForces)
1772     {
1773         /* Reset forces for which the virial is calculated separately:
1774          * PME/Ewald forces if necessary */
1775         if (fr->bF_NoVirSum)
1776         {
1777             if (flags & GMX_FORCE_VIRIAL)
1778             {
1779                 fr->f_novirsum = fr->f_novirsum_alloc;
1780                 if (fr->bDomDec)
1781                 {
1782                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1783                 }
1784                 else
1785                 {
1786                     clear_rvecs(homenr, fr->f_novirsum+start);
1787                 }
1788             }
1789             else
1790             {
1791                 /* We are not calculating the pressure so we do not need
1792                  * a separate array for forces that do not contribute
1793                  * to the pressure.
1794                  */
1795                 fr->f_novirsum = f;
1796             }
1797         }
1798
1799         /* Clear the short- and long-range forces */
1800         clear_rvecs(fr->natoms_force_constr, f);
1801
1802         clear_rvec(fr->vir_diag_posres);
1803     }
1804     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1805     {
1806         clear_pull_forces(inputrec->pull_work);
1807     }
1808
1809     /* update QMMMrec, if necessary */
1810     if (fr->bQMMM)
1811     {
1812         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1813     }
1814
1815     /* Compute the bonded and non-bonded energies and optionally forces */
1816     do_force_lowlevel(fr, inputrec, &(top->idef),
1817                       cr, nrnb, wcycle, mdatoms,
1818                       x, hist, f, enerd, fcd, top, fr->born,
1819                       bBornRadii, box,
1820                       inputrec->fepvals, lambda,
1821                       graph, &(top->excls), fr->mu_tot,
1822                       flags,
1823                       &cycles_pme);
1824
1825     cycles_force = wallcycle_stop(wcycle, ewcFORCE);
1826
1827     if (ed)
1828     {
1829         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1830     }
1831
1832     if (DOMAINDECOMP(cr))
1833     {
1834         dd_force_flop_stop(cr->dd, nrnb);
1835         if (wcycle)
1836         {
1837             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1838         }
1839     }
1840
1841     if (bDoForces)
1842     {
1843         if (inputrecElecField(inputrec))
1844         {
1845             /* Compute forces due to electric field */
1846             calc_f_el(MASTER(cr) ? field : NULL,
1847                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1848                       inputrec->ex, inputrec->et, t);
1849         }
1850
1851         /* Communicate the forces */
1852         if (DOMAINDECOMP(cr))
1853         {
1854             wallcycle_start(wcycle, ewcMOVEF);
1855             dd_move_f(cr->dd, f, fr->fshift);
1856             /* Do we need to communicate the separate force array
1857              * for terms that do not contribute to the single sum virial?
1858              * Position restraints and electric fields do not introduce
1859              * inter-cg forces, only full electrostatics methods do.
1860              * When we do not calculate the virial, fr->f_novirsum = f,
1861              * so we have already communicated these forces.
1862              */
1863             if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1864                 (flags & GMX_FORCE_VIRIAL))
1865             {
1866                 dd_move_f(cr->dd, fr->f_novirsum, NULL);
1867             }
1868             wallcycle_stop(wcycle, ewcMOVEF);
1869         }
1870
1871         /* If we have NoVirSum forces but do not calculate the virial,
1872          * fr->f_novirsum == f, so the summation is done later.
1873          */
1874         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1875         {
1876             wallcycle_start(wcycle, ewcVSITESPREAD);
1877             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1878                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1879             wallcycle_stop(wcycle, ewcVSITESPREAD);
1880         }
1881
1882         if (flags & GMX_FORCE_VIRIAL)
1883         {
1884             /* Calculation of the virial must be done after vsites! */
1885             calc_virial(0, mdatoms->homenr, x, f,
1886                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1887         }
1888     }
1889
1890     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1891     {
1892         pull_potential_wrapper(cr, inputrec, box, x,
1893                                f, vir_force, mdatoms, enerd, lambda, t,
1894                                wcycle);
1895     }
1896
1897     /* Add the forces from enforced rotation potentials (if any) */
1898     if (inputrec->bRot)
1899     {
1900         wallcycle_start(wcycle, ewcROTadd);
1901         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1902         wallcycle_stop(wcycle, ewcROTadd);
1903     }
1904
1905     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1906     IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1907
1908     if (PAR(cr) && !(cr->duty & DUTY_PME))
1909     {
1910         /* In case of node-splitting, the PP nodes receive the long-range
1911          * forces, virial and energy from the PME nodes here.
1912          */
1913         pme_receive_force_ener(cr, wcycle, enerd, fr);
1914     }
1915
1916     if (bDoForces)
1917     {
1918         post_process_forces(cr, step, nrnb, wcycle,
1919                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1920                             flags);
1921     }
1922
1923     if (flags & GMX_FORCE_ENERGY)
1924     {
1925         /* Sum the potential energy terms from group contributions */
1926         sum_epot(&(enerd->grpp), enerd->term);
1927
1928         if (!EI_TPI(inputrec->eI))
1929         {
1930             checkPotentialEnergyValidity(enerd);
1931         }
1932     }
1933
1934 }
1935
1936 void do_force(FILE *fplog, t_commrec *cr,
1937               t_inputrec *inputrec,
1938               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1939               gmx_localtop_t *top,
1940               gmx_groups_t *groups,
1941               matrix box, rvec x[], history_t *hist,
1942               rvec f[],
1943               tensor vir_force,
1944               t_mdatoms *mdatoms,
1945               gmx_enerdata_t *enerd, t_fcdata *fcd,
1946               real *lambda, t_graph *graph,
1947               t_forcerec *fr,
1948               gmx_vsite_t *vsite, rvec mu_tot,
1949               double t, FILE *field, gmx_edsam_t ed,
1950               gmx_bool bBornRadii,
1951               int flags)
1952 {
1953     /* modify force flag if not doing nonbonded */
1954     if (!fr->bNonbonded)
1955     {
1956         flags &= ~GMX_FORCE_NONBONDED;
1957     }
1958
1959     switch (inputrec->cutoff_scheme)
1960     {
1961         case ecutsVERLET:
1962             do_force_cutsVERLET(fplog, cr, inputrec,
1963                                 step, nrnb, wcycle,
1964                                 top,
1965                                 groups,
1966                                 box, x, hist,
1967                                 f, vir_force,
1968                                 mdatoms,
1969                                 enerd, fcd,
1970                                 lambda, graph,
1971                                 fr, fr->ic,
1972                                 vsite, mu_tot,
1973                                 t, field, ed,
1974                                 bBornRadii,
1975                                 flags);
1976             break;
1977         case ecutsGROUP:
1978             do_force_cutsGROUP(fplog, cr, inputrec,
1979                                step, nrnb, wcycle,
1980                                top,
1981                                groups,
1982                                box, x, hist,
1983                                f, vir_force,
1984                                mdatoms,
1985                                enerd, fcd,
1986                                lambda, graph,
1987                                fr, vsite, mu_tot,
1988                                t, field, ed,
1989                                bBornRadii,
1990                                flags);
1991             break;
1992         default:
1993             gmx_incons("Invalid cut-off scheme passed!");
1994     }
1995 }
1996
1997
1998 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1999                         t_inputrec *ir, t_mdatoms *md,
2000                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
2001                         t_forcerec *fr, gmx_localtop_t *top)
2002 {
2003     int             i, m, start, end;
2004     gmx_int64_t     step;
2005     real            dt = ir->delta_t;
2006     real            dvdl_dum;
2007     rvec           *savex;
2008
2009     /* We need to allocate one element extra, since we might use
2010      * (unaligned) 4-wide SIMD loads to access rvec entries.
2011      */
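     /* E.g. an unaligned 4-wide single-precision load that starts at the last
      * rvec reads one real past its three components; the extra element keeps
      * that read inside the allocation. */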
2012     snew(savex, state->natoms + 1);
2013
2014     start = 0;
2015     end   = md->homenr;
2016
2017     if (debug)
2018     {
2019         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2020                 start, md->homenr, end);
2021     }
2022     /* Do a first constraint pass to reset the particles... */
2023     step = ir->init_step;
2024     if (fplog)
2025     {
2026         char buf[STEPSTRSIZE];
2027         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2028                 gmx_step_str(step, buf));
2029     }
2030     dvdl_dum = 0;
2031
2032     /* constrain the current position */
2033     constrain(NULL, TRUE, FALSE, constr, &(top->idef),
2034               ir, cr, step, 0, 1.0, md,
2035               state->x, state->x, NULL,
2036               fr->bMolPBC, state->box,
2037               state->lambda[efptBONDED], &dvdl_dum,
2038               NULL, NULL, nrnb, econqCoord);
2039     if (EI_VV(ir->eI))
2040     {
2041         /* Constrain the initial velocity, and save it.
2042          * It may also be useful if we need the ekin from the halfstep for velocity verlet. */
2043         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
2044                   ir, cr, step, 0, 1.0, md,
2045                   state->x, state->v, state->v,
2046                   fr->bMolPBC, state->box,
2047                   state->lambda[efptBONDED], &dvdl_dum,
2048                   NULL, NULL, nrnb, econqVeloc);
2049     }
2050     /* constrain the initial velocities at t-dt/2 */
2051     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2052     {
2053         for (i = start; (i < end); i++)
2054         {
2055             for (m = 0; (m < DIM); m++)
2056             {
2057                 /* Reverse the velocity */
2058                 state->v[i][m] = -state->v[i][m];
2059                 /* Store the position at t-dt in buf */
2060                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2061             }
2062         }
2063         /* Shake the positions at t=-dt with the positions at t=0
2064          * as reference coordinates.
2065          */
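         /* In other words (a sketch of the algebra): with v reversed, savex
          * holds x(0) - dt*v(0), i.e. the unconstrained positions at t = -dt.
          * Constraining savex with x(0) as reference, while passing state->v,
          * also corrects the velocities to be constraint-consistent at the
          * -dt/2 half step; afterwards v is re-reversed below. */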
2066         if (fplog)
2067         {
2068             char buf[STEPSTRSIZE];
2069             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2070                     gmx_step_str(step, buf));
2071         }
2072         dvdl_dum = 0;
2073         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
2074                   ir, cr, step, -1, 1.0, md,
2075                   state->x, savex, NULL,
2076                   fr->bMolPBC, state->box,
2077                   state->lambda[efptBONDED], &dvdl_dum,
2078                   state->v, NULL, nrnb, econqCoord);
2079
2080         for (i = start; i < end; i++)
2081         {
2082             for (m = 0; m < DIM; m++)
2083             {
2084                 /* Re-reverse the velocities */
2085                 state->v[i][m] = -state->v[i][m];
2086             }
2087         }
2088     }
2089     sfree(savex);
2090 }
2091
2092
2093 static void
2094 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2095                 double *enerout, double *virout)
2096 {
2097     double enersum, virsum;
2098     double invscale, invscale2, invscale3;
2099     double r, ea, eb, ec, pa, pb, pc, pd;
2100     double y0, f, g, h;
2101     int    ri, offset;
2102     double tabfactor;
2103
2104     invscale  = 1.0/scale;
2105     invscale2 = invscale*invscale;
2106     invscale3 = invscale*invscale2;
2107
2108     /* The following summation is derived from the cubic spline definition
2109      * (Numerical Recipes in C, second edition, pp. 113-116) and is exact
2110      * for the cubic spline.  We first calculate the negative of the
2111      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2112      * add the more standard, abrupt cutoff correction to that result,
2113      * yielding the long-range correction for a switched function.  We
2114      * perform both the pressure and energy loops at the same time for
2115      * simplicity, as the computational cost is low. */
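     /* In other words (a sketch, assuming the spline layout used below, where
      * each table point stores y0, f, g, h and the potential on an interval is
      * V(u) = y0 + f*u + g*u^2 + h*u^3 with u = r*scale - ri): per interval,
      *   enersum ~ integral of r^2 * V(r)  and  virsum ~ integral of r^3 * dV/dr,
      * which is where the (ea,eb,ec) and (pa,pb,pc,pd) moment factors below
      * come from; the 4*pi shell weight enters via the final 4.0*M_PI factors.
      */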
2116
2117     if (offstart == 0)
2118     {
2119         /* Since the dispersion table has been scaled down a factor
2120          * 6.0 and the repulsion a factor 12.0 to compensate for the
2121          * c6/c12 parameters inside nbfp[] being scaled up (to save
2122          * flops in kernels), we need to correct for this.
2123          */
2124         tabfactor = 6.0;
2125     }
2126     else
2127     {
2128         tabfactor = 12.0;
2129     }
2130
2131     enersum = 0.0;
2132     virsum  = 0.0;
2133     for (ri = rstart; ri < rend; ++ri)
2134     {
2135         r  = ri*invscale;
2136         ea = invscale3;
2137         eb = 2.0*invscale2*r;
2138         ec = invscale*r*r;
2139
2140         pa = invscale3;
2141         pb = 3.0*invscale2*r;
2142         pc = 3.0*invscale*r*r;
2143         pd = r*r*r;
2144
2145         /* The "8" is the stride of the vdwtab array: each point stores y0, f, g, h
2146            for dispersion (offsets 0-3) and repulsion (4-7); perhaps it should be a named constant? */
2147
2148         offset = 8*ri + offstart;
2149         y0     = vdwtab[offset];
2150         f      = vdwtab[offset+1];
2151         g      = vdwtab[offset+2];
2152         h      = vdwtab[offset+3];
2153
2154         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2155         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2156     }
2157     *enerout = 4.0*M_PI*enersum*tabfactor;
2158     *virout  = 4.0*M_PI*virsum*tabfactor;
2159 }
2160
2161 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2162 {
2163     double   eners[2], virs[2], enersum, virsum;
2164     double   r0, rc3, rc9;
2165     int      ri0, ri1, i;
2166     real     scale, *vdwtab;
2167
2168     fr->enershiftsix    = 0;
2169     fr->enershifttwelve = 0;
2170     fr->enerdiffsix     = 0;
2171     fr->enerdifftwelve  = 0;
2172     fr->virdiffsix      = 0;
2173     fr->virdifftwelve   = 0;
2174
2175     if (eDispCorr != edispcNO)
2176     {
2177         for (i = 0; i < 2; i++)
2178         {
2179             eners[i] = 0;
2180             virs[i]  = 0;
2181         }
2182         if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2183             (fr->vdw_modifier == eintmodPOTSWITCH) ||
2184             (fr->vdw_modifier == eintmodFORCESWITCH) ||
2185             (fr->vdwtype == evdwSHIFT) ||
2186             (fr->vdwtype == evdwSWITCH))
2187         {
2188             if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2189                  (fr->vdw_modifier == eintmodFORCESWITCH) ||
2190                  (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2191             {
2192                 gmx_fatal(FARGS,
2193                           "With dispersion correction rvdw-switch can not be zero "
2194                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2195             }
2196
2197             /* TODO This code depends on the logic in tables.c that
2198                constructs the table layout, which should be made
2199                explicit in future cleanup. */
2200             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2201                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2202             scale  = fr->dispersionCorrectionTable->scale;
2203             vdwtab = fr->dispersionCorrectionTable->data;
2204
2205             /* Round the cut-offs to exact table values for precision */
2206             ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
2207             ri1  = static_cast<int>(ceil(fr->rvdw*scale));
2208
2209             /* The code below has some support for handling force-switching, i.e.
2210              * when the force (instead of potential) is switched over a limited
2211              * region. This leads to a constant shift in the potential inside the
2212              * switching region, which we can handle by adding a constant energy
2213              * term in the force-switch case just like when we do potential-shift.
2214              *
2215              * For now this is not enabled, but to keep the functionality in the
2216              * code we check separately for switch and shift. When we do force-switch
2217              * the shifting point is rvdw_switch, while it is the cutoff when we
2218              * have a classical potential-shift.
2219              *
2220              * For a pure potential-shift the potential has a constant shift
2221              * all the way out to the cutoff, and that is it. For other forms
2222              * we need to calculate the constant shift up to the point where we
2223              * start modifying the potential.
2224              */
2225             ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2226
2227             r0   = ri0/scale;
2228             rc3  = r0*r0*r0;
2229             rc9  = rc3*rc3*rc3;
2230
2231             if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2232                 (fr->vdwtype == evdwSHIFT))
2233             {
2234                 /* Determine the constant energy shift below rvdw_switch.
2235                  * The table carries a scale factor: it was scaled down to compensate
2236                  * for scaling up c6/c12 to save flops in analytical kernels, hence the 6.0/12.0 factors.
2237                  */
2238                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2239                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2240             }
2241             else if (fr->vdw_modifier == eintmodPOTSHIFT)
2242             {
2243                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2244                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2245             }
2246
2247             /* Add the constant part from 0 to rvdw_switch.
2248              * This integration from 0 to rvdw_switch overcounts the number
2249              * of interactions by 1, as it also counts the self interaction.
2250              * We will correct for this later.
2251              */
2252             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2253             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2254
2255             /* Calculate the contribution in the range [r0,r1] where we
2256              * modify the potential. For a pure potential-shift modifier we will
2257              * have ri0==ri1, and there will not be any contribution here.
2258              */
2259             for (i = 0; i < 2; i++)
2260             {
2261                 enersum = 0;
2262                 virsum  = 0;
2263                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2264                 eners[i] -= enersum;
2265                 virs[i]  -= virsum;
2266             }
2267
2268             /* Alright: Above we compensated by REMOVING the parts outside r0
2269              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2270              *
2271              * Regardless of whether r0 is the point where we start switching,
2272              * or the cutoff where we calculated the constant shift, we include
2273              * all the parts we are missing out to infinity from r0 by
2274              * calculating the analytical dispersion correction.
2275              */
2276             eners[0] += -4.0*M_PI/(3.0*rc3);
2277             eners[1] +=  4.0*M_PI/(9.0*rc9);
2278             virs[0]  +=  8.0*M_PI/rc3;
2279             virs[1]  += -16.0*M_PI/(3.0*rc9);
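             /* For reference, these are the standard analytic tail integrals
              * per unit c6/c12: int_{r0}^inf 4*pi*r^2*(-1/r^6) dr = -4*pi/(3*r0^3),
              * int 4*pi*r^2*(1/r^12) dr = 4*pi/(9*r0^9), and for the virial
              * int 4*pi*r^3*dV/dr dr gives 8*pi/r0^3 and -16*pi/(3*r0^9). */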
2280         }
2281         else if (fr->vdwtype == evdwCUT ||
2282                  EVDW_PME(fr->vdwtype) ||
2283                  fr->vdwtype == evdwUSER)
2284         {
2285             if (fr->vdwtype == evdwUSER && fplog)
2286             {
2287                 fprintf(fplog,
2288                         "WARNING: using dispersion correction with user tables\n");
2289             }
2290
2291             /* Note that with LJ-PME, the dispersion correction is multiplied
2292              * by the difference between the actual C6 and the value of C6
2293              * that would produce the combination rule.
2294              * This means the normal energy and virial difference formulas
2295              * can be used here.
2296              */
2297
2298             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2299             rc9  = rc3*rc3*rc3;
2300             /* Contribution beyond the cut-off */
2301             eners[0] += -4.0*M_PI/(3.0*rc3);
2302             eners[1] +=  4.0*M_PI/(9.0*rc9);
2303             if (fr->vdw_modifier == eintmodPOTSHIFT)
2304             {
2305                 /* Contribution within the cut-off */
2306                 eners[0] += -4.0*M_PI/(3.0*rc3);
2307                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2308             }
2309             /* Contribution beyond the cut-off */
2310             virs[0]  +=  8.0*M_PI/rc3;
2311             virs[1]  += -16.0*M_PI/(3.0*rc9);
2312         }
2313         else
2314         {
2315             gmx_fatal(FARGS,
2316                       "Dispersion correction is not implemented for vdw-type = %s",
2317                       evdw_names[fr->vdwtype]);
2318         }
2319
2320         /* When we deprecate the group kernels the code below can go too */
2321         if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2322         {
2323             /* Calculate self-interaction coefficient (assuming that
2324              * the reciprocal-space contribution is constant in the
2325              * region that contributes to the self-interaction).
2326              */
2327             fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2328
2329             eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2330             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2331         }
2332
2333         fr->enerdiffsix    = eners[0];
2334         fr->enerdifftwelve = eners[1];
2335         /* The 0.5 is due to the Gromacs definition of the virial */
2336         fr->virdiffsix     = 0.5*virs[0];
2337         fr->virdifftwelve  = 0.5*virs[1];
2338     }
2339 }
2340
2341 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2342                    matrix box, real lambda, tensor pres, tensor virial,
2343                    real *prescorr, real *enercorr, real *dvdlcorr)
2344 {
2345     gmx_bool bCorrAll, bCorrPres;
2346     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2347     int      m;
2348
2349     *prescorr = 0;
2350     *enercorr = 0;
2351     *dvdlcorr = 0;
2352
2353     clear_mat(virial);
2354     clear_mat(pres);
2355
2356     if (ir->eDispCorr != edispcNO)
2357     {
2358         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2359                      ir->eDispCorr == edispcAllEnerPres);
2360         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2361                      ir->eDispCorr == edispcAllEnerPres);
2362
2363         invvol = 1/det(box);
2364         if (fr->n_tpi)
2365         {
2366             /* Only correct for the interactions with the inserted molecule */
2367             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2368             ninter = fr->n_tpi;
2369         }
2370         else
2371         {
2372             dens   = fr->numAtomsForDispersionCorrection*invvol;
2373             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2374         }
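         /* A sketch of the bookkeeping: for a homogeneous system
          * ninter*dens ~ (N/2)*(N/V) = N^2/(2V), so with a plain cut-off the
          * energy correction is <C6>*N^2/(2V)*(-4*pi/(3*rc^3))
          * = -2*pi*N^2*<C6>/(3*V*rc^3), the textbook LJ tail correction. */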
2375
2376         if (ir->efep == efepNO)
2377         {
2378             avcsix    = fr->avcsix[0];
2379             avctwelve = fr->avctwelve[0];
2380         }
2381         else
2382         {
2383             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2384             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2385         }
2386
2387         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2388         *enercorr += avcsix*enerdiff;
2389         dvdlambda  = 0.0;
2390         if (ir->efep != efepNO)
2391         {
2392             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2393         }
2394         if (bCorrAll)
2395         {
2396             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2397             *enercorr += avctwelve*enerdiff;
2398             if (fr->efep != efepNO)
2399             {
2400                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2401             }
2402         }
2403
2404         if (bCorrPres)
2405         {
2406             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2407             if (ir->eDispCorr == edispcAllEnerPres)
2408             {
2409                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2410             }
2411             /* The factor 2 is because of the Gromacs virial definition */
2412             spres = -2.0*invvol*svir*PRESFAC;
2413
2414             for (m = 0; m < DIM; m++)
2415             {
2416                 virial[m][m] += svir;
2417                 pres[m][m]   += spres;
2418             }
2419             *prescorr += spres;
2420         }
2421
2422         /* Can't currently control when this prints; for now, just print when debugging */
2423         if (debug)
2424         {
2425             if (bCorrAll)
2426             {
2427                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2428                         avcsix, avctwelve);
2429             }
2430             if (bCorrPres)
2431             {
2432                 fprintf(debug,
2433                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2434                         *enercorr, spres, svir);
2435             }
2436             else
2437             {
2438                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2439             }
2440         }
2441
2442         if (fr->efep != efepNO)
2443         {
2444             *dvdlcorr += dvdlambda;
2445         }
2446     }
2447 }
2448
2449 void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
2450                   t_graph *graph, rvec x[])
2451 {
2452     if (fplog)
2453     {
2454         fprintf(fplog, "Removing pbc first time\n");
2455     }
2456     calc_shifts(box, fr->shift_vec);
2457     if (graph)
2458     {
2459         mk_mshift(fplog, graph, fr->ePBC, box, x);
2460         if (gmx_debug_at)
2461         {
2462             p_graph(debug, "do_pbc_first 1", graph);
2463         }
2464         shift_self(graph, box, x);
2465         /* By doing an extra mk_mshift the molecules that are broken
2466          * because they were e.g. imported from other software
2467          * will be made whole again. Such are the healing powers
2468          * of GROMACS.
2469          */
2470         mk_mshift(fplog, graph, fr->ePBC, box, x);
2471         if (gmx_debug_at)
2472         {
2473             p_graph(debug, "do_pbc_first 2", graph);
2474         }
2475     }
2476     if (fplog)
2477     {
2478         fprintf(fplog, "Done rmpbc\n");
2479     }
2480 }
2481
2482 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2483                             const gmx_mtop_t *mtop, rvec x[],
2484                             gmx_bool bFirst)
2485 {
2486     t_graph        *graph;
2487     int             mb, as, mol;
2488     gmx_molblock_t *molb;
2489
2490     if (bFirst && fplog)
2491     {
2492         fprintf(fplog, "Removing pbc first time\n");
2493     }
2494
2495     snew(graph, 1);
2496     as = 0;
2497     for (mb = 0; mb < mtop->nmolblock; mb++)
2498     {
2499         molb = &mtop->molblock[mb];
2500         if (molb->natoms_mol == 1 ||
2501             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2502         {
2503             /* Just one atom or charge group in the molecule, no PBC required */
2504             as += molb->nmol*molb->natoms_mol;
2505         }
2506         else
2507         {
2508             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2509             mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
2510                            0, molb->natoms_mol, FALSE, FALSE, graph);
2511
2512             for (mol = 0; mol < molb->nmol; mol++)
2513             {
2514                 mk_mshift(fplog, graph, ePBC, box, x+as);
2515
2516                 shift_self(graph, box, x+as);
2517                 /* The molecule is whole now.
2518                  * We don't need the second mk_mshift call as in do_pbc_first,
2519                  * since we no longer need this graph.
2520                  */
2521
2522                 as += molb->natoms_mol;
2523             }
2524             done_graph(graph);
2525         }
2526     }
2527     sfree(graph);
2528 }
2529
2530 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2531                        const gmx_mtop_t *mtop, rvec x[])
2532 {
2533     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2534 }
2535
2536 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2537                  gmx_mtop_t *mtop, rvec x[])
2538 {
2539     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2540 }
2541
2542 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2543 {
2544     int t, nth;
2545     nth = gmx_omp_nthreads_get(emntDefault);
2546
2547 #pragma omp parallel for num_threads(nth) schedule(static)
2548     for (t = 0; t < nth; t++)
2549     {
2550         try
2551         {
2552             int offset, len;
2553
2554             offset = (natoms*t    )/nth;
2555             len    = (natoms*(t + 1))/nth - offset;
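             /* Static partition example: natoms = 10, nth = 4 gives offsets
              * 0, 2, 5, 7 and lengths 2, 3, 2, 3, i.e. contiguous chunks
              * whose sizes differ by at most one atom. */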
2556             put_atoms_in_box(ePBC, box, len, x + offset);
2557         }
2558         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2559     }
2560 }
2561
2562 // TODO This can be cleaned up a lot and moved back to runner.cpp
2563 void finish_run(FILE *fplog, t_commrec *cr,
2564                 t_inputrec *inputrec,
2565                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2566                 gmx_walltime_accounting_t walltime_accounting,
2567                 nonbonded_verlet_t *nbv,
2568                 gmx_bool bWriteStat)
2569 {
2570     t_nrnb *nrnb_tot = NULL;
2571     double  delta_t  = 0;
2572     double  nbfs     = 0, mflop = 0;
2573     double  elapsed_time,
2574             elapsed_time_over_all_ranks,
2575             elapsed_time_over_all_threads,
2576             elapsed_time_over_all_threads_over_all_ranks;
2577     /* Control whether it is valid to print a report. Only the
2578        simulation master may print, but it should not do so if the run
2579        terminated e.g. before a scheduled reset step. This is
2580        complicated by the fact that PME ranks are unaware of the
2581        reason why they were sent a pmerecvqxFINISH. To avoid
2582        communication deadlocks, we always do the communication for the
2583        report, even if we've decided not to write the report, because
2584        how long it takes to finish the run is not important when we've
2585        decided not to report on the simulation performance. */
2586     bool    printReport = SIMMASTER(cr);
2587
2588     if (!walltime_accounting_get_valid_finish(walltime_accounting))
2589     {
2590         md_print_warn(cr, fplog,
2591                       "Simulation ended prematurely, no performance report will be written.");
2592         printReport = false;
2593     }
2594
2595     if (cr->nnodes > 1)
2596     {
2597         snew(nrnb_tot, 1);
2598 #if GMX_MPI
2599         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2600                       cr->mpi_comm_mysim);
2601 #endif
2602     }
2603     else
2604     {
2605         nrnb_tot = nrnb;
2606     }
2607
2608     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2609     elapsed_time_over_all_ranks                  = elapsed_time;
2610     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2611     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2612 #if GMX_MPI
2613     if (cr->nnodes > 1)
2614     {
2615         /* reduce elapsed_time over all MPI ranks in the current simulation */
2616         MPI_Allreduce(&elapsed_time,
2617                       &elapsed_time_over_all_ranks,
2618                       1, MPI_DOUBLE, MPI_SUM,
2619                       cr->mpi_comm_mysim);
2620         elapsed_time_over_all_ranks /= cr->nnodes;
2621         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2622          * current simulation. */
2623         MPI_Allreduce(&elapsed_time_over_all_threads,
2624                       &elapsed_time_over_all_threads_over_all_ranks,
2625                       1, MPI_DOUBLE, MPI_SUM,
2626                       cr->mpi_comm_mysim);
2627     }
2628 #endif
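     /* Note the asymmetry above: the per-rank elapsed time is averaged over
      * the ranks, while the over-all-threads time is left as a sum, since it
      * effectively counts aggregate core time. */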
2629
2630     if (printReport)
2631     {
2632         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2633     }
2634     if (cr->nnodes > 1)
2635     {
2636         sfree(nrnb_tot);
2637     }
2638
2639     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2640     {
2641         print_dd_statistics(cr, inputrec, fplog);
2642     }
2643
2644     /* TODO Move the responsibility for any scaling by thread counts
2645      * to the code that handled the thread region, so that there's a
2646      * mechanism to keep cycle counting working during the transition
2647      * to task parallelism. */
2648     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2649     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2650     wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2651     auto cycle_sum(wallcycle_sum(cr, wcycle));
2652
2653     if (printReport)
2654     {
2655         struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
2656
2657         wallcycle_print(fplog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2658                         elapsed_time_over_all_ranks,
2659                         wcycle, cycle_sum, gputimes);
2660
2661         if (EI_DYNAMICS(inputrec->eI))
2662         {
2663             delta_t = inputrec->delta_t;
2664         }
2665
2666         if (fplog)
2667         {
2668             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2669                        elapsed_time_over_all_ranks,
2670                        walltime_accounting_get_nsteps_done(walltime_accounting),
2671                        delta_t, nbfs, mflop);
2672         }
2673         if (bWriteStat)
2674         {
2675             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2676                        elapsed_time_over_all_ranks,
2677                        walltime_accounting_get_nsteps_done(walltime_accounting),
2678                        delta_t, nbfs, mflop);
2679         }
2680     }
2681 }
2682
2683 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
2684 {
2685     /* this function works, but could probably use a logic rewrite to keep all the different
2686        types of efep straight. */
2687
2688     int       i;
2689     t_lambda *fep = ir->fepvals;
2690
2691     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2692     {
2693         for (i = 0; i < efptNR; i++)
2694         {
2695             lambda[i] = 0.0;
2696             if (lam0)
2697             {
2698                 lam0[i] = 0.0;
2699             }
2700         }
2701         return;
2702     }
2703     else
2704     {
2705         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2706                                              the checkpoint, if a checkpoint is set; a kludge
2707                                              is in place for now to prevent this. */
2708         for (i = 0; i < efptNR; i++)
2709         {
2710             /* overwrite lambda state with init_lambda for now for backwards compatibility */
2711             if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2712             {
2713                 lambda[i] = fep->init_lambda;
2714                 if (lam0)
2715                 {
2716                     lam0[i] = lambda[i];
2717                 }
2718             }
2719             else
2720             {
2721                 lambda[i] = fep->all_lambda[i][*fep_state];
2722                 if (lam0)
2723                 {
2724                     lam0[i] = lambda[i];
2725                 }
2726             }
2727         }
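         /* Example of the resulting behavior: with init_lambda = 0.5 every
          * lambda component becomes 0.5, while with init_lambda = -1 and
          * init_fep_state = 2 each component i becomes all_lambda[i][2]. */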
2728         if (ir->bSimTemp)
2729         {
2730             /* need to rescale control temperatures to match current state */
2731             for (i = 0; i < ir->opts.ngtc; i++)
2732             {
2733                 if (ir->opts.ref_t[i] > 0)
2734                 {
2735                     ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2736                 }
2737             }
2738         }
2739     }
2740
2741     /* Write the initial lambda values to the log */
2742     if (fplog != NULL)
2743     {
2744         fprintf(fplog, "Initial vector of lambda components:[ ");
2745         for (i = 0; i < efptNR; i++)
2746         {
2747             fprintf(fplog, "%10.4f ", lambda[i]);
2748         }
2749         fprintf(fplog, "]\n");
2750     }
2751     return;
2752 }
2753
2754
2755 void init_md(FILE *fplog,
2756              t_commrec *cr, t_inputrec *ir, const gmx_output_env_t *oenv,
2757              double *t, double *t0,
2758              real *lambda, int *fep_state, double *lam0,
2759              t_nrnb *nrnb, gmx_mtop_t *mtop,
2760              gmx_update_t **upd,
2761              int nfile, const t_filenm fnm[],
2762              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2763              tensor force_vir, tensor shake_vir, rvec mu_tot,
2764              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2765              gmx_wallcycle_t wcycle)
2766 {
2767     int  i;
2768
2769     /* Initial values */
2770     *t = *t0       = ir->init_t;
2771
2772     *bSimAnn = FALSE;
2773     for (i = 0; i < ir->opts.ngtc; i++)
2774     {
2775         /* set bSimAnn if any group is being annealed */
2776         if (ir->opts.annealing[i] != eannNO)
2777         {
2778             *bSimAnn = TRUE;
2779         }
2780     }
2781
2782     /* Initialize lambda variables */
2783     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2784
2785     // TODO upd is never NULL in practice, but the analysers don't know that
2786     if (upd)
2787     {
2788         *upd = init_update(ir);
2789     }
2790     if (*bSimAnn)
2791     {
2792         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : NULL);
2793     }
2794
2795     if (vcm != NULL)
2796     {
2797         *vcm = init_vcm(fplog, &mtop->groups, ir);
2798     }
2799
2800     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2801     {
2802         if (ir->etc == etcBERENDSEN)
2803         {
2804             please_cite(fplog, "Berendsen84a");
2805         }
2806         if (ir->etc == etcVRESCALE)
2807         {
2808             please_cite(fplog, "Bussi2007a");
2809         }
2810         if (ir->eI == eiSD1)
2811         {
2812             please_cite(fplog, "Goga2012");
2813         }
2814     }
2815     if ((ir->et[XX].n > 0) || (ir->et[YY].n > 0) || (ir->et[ZZ].n > 0))
2816     {
2817         please_cite(fplog, "Caleman2008a");
2818     }
2819     init_nrnb(nrnb);
2820
2821     if (nfile != -1)
2822     {
2823         *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv, wcycle);
2824
2825         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : mdoutf_get_fp_ene(*outf),
2826                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2827     }
2828
2829     /* Initialize variables */
2830     clear_mat(force_vir);
2831     clear_mat(shake_vir);
2832     clear_rvec(mu_tot);
2833 }