Convert gmx_mtop_t to C++
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <stdio.h>
44 #include <string.h>
45
46 #include <cmath>
47 #include <cstdint>
48
49 #include <array>
50
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/pbcutil/ishift.h"
93 #include "gromacs/pbcutil/mshift.h"
94 #include "gromacs/pbcutil/pbc.h"
95 #include "gromacs/pulling/pull.h"
96 #include "gromacs/pulling/pull_rotation.h"
97 #include "gromacs/timing/cyclecounter.h"
98 #include "gromacs/timing/gpu_timing.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/wallcyclereporting.h"
101 #include "gromacs/timing/walltime_accounting.h"
102 #include "gromacs/topology/topology.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
114
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
118
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
124
125
126 void print_time(FILE                     *out,
127                 gmx_walltime_accounting_t walltime_accounting,
128                 gmx_int64_t               step,
129                 t_inputrec               *ir,
130                 const t_commrec          *cr)
131 {
132     time_t finish;
133     char   timebuf[STRLEN];
134     double dt, elapsed_seconds, time_per_step;
135     char   buf[48];
136
137 #if !GMX_THREAD_MPI
138     if (!PAR(cr))
139 #endif
140     {
141         fprintf(out, "\r");
142     }
143     fprintf(out, "step %s", gmx_step_str(step, buf));
144     fflush(out);
145
146     if ((step >= ir->nstlist))
147     {
148         double seconds_since_epoch = gmx_gettime();
149         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
151         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
152
153         if (ir->nsteps >= 0)
154         {
155             if (dt >= 300)
156             {
157                 finish = (time_t) (seconds_since_epoch + dt);
158                 gmx_ctime_r(&finish, timebuf, STRLEN);
159                 sprintf(buf, "%s", timebuf);
160                 buf[strlen(buf)-1] = '\0';
161                 fprintf(out, ", will finish %s", buf);
162             }
163             else
164             {
165                 fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
166             }
167         }
168         else
169         {
170             fprintf(out, " performance: %.1f ns/day    ",
171                     ir->delta_t/1000*24*60*60/time_per_step);
172         }
173     }
174 #if !GMX_THREAD_MPI
175     if (PAR(cr))
176     {
177         fprintf(out, "\n");
178     }
179 #else
180     GMX_UNUSED_VALUE(cr);
181 #endif
182
183     fflush(out);
184 }
185
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187                          double the_time)
188 {
189     char   time_string[STRLEN];
190
191     if (!fplog)
192     {
193         return;
194     }
195
196     {
197         int    i;
198         char   timebuf[STRLEN];
199         time_t temp_time = (time_t) the_time;
200
201         gmx_ctime_r(&temp_time, timebuf, STRLEN);
202         for (i = 0; timebuf[i] >= ' '; i++)
203         {
204             time_string[i] = timebuf[i];
205         }
206         time_string[i] = '\0';
207     }
208
209     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
210 }
211
212 void print_start(FILE *fplog, const t_commrec *cr,
213                  gmx_walltime_accounting_t walltime_accounting,
214                  const char *name)
215 {
216     char buf[STRLEN];
217
218     sprintf(buf, "Started %s", name);
219     print_date_and_time(fplog, cr->nodeid, buf,
220                         walltime_accounting_get_start_time_stamp(walltime_accounting));
221 }
222
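/*! \brief Add the forces in \p forceToAdd element-wise to \p f,
 * parallelized with OpenMP over the default thread count.
 */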
223 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
224 {
225     const int end = forceToAdd.size();
226
227     // cppcheck-suppress unreadVariable
228     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
229 #pragma omp parallel for num_threads(nt) schedule(static)
230     for (int i = 0; i < end; i++)
231     {
232         rvec_inc(f[i], forceToAdd[i]);
233     }
234 }
235
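/*! \brief Reduce the outputs of the GPU PME task: add the virial \p vir_Q and the
 * reciprocal-space Coulomb energy \p Vlr_q to \p forceWithVirial and \p enerd,
 * and sum \p pmeForces into the force buffer.
 */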
236 static void pme_gpu_reduce_outputs(gmx_wallcycle_t            wcycle,
237                                    ForceWithVirial           *forceWithVirial,
238                                    ArrayRef<const gmx::RVec>  pmeForces,
239                                    gmx_enerdata_t            *enerd,
240                                    const tensor               vir_Q,
241                                    real                       Vlr_q)
242 {
243     wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
244     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
245     forceWithVirial->addVirialContribution(vir_Q);
246     enerd->term[F_COUL_RECIP] += Vlr_q;
247     sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
248     wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
249 }
250
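/*! \brief Compute the short-range virial contribution of the local atoms,
 * including the shift-force and wall terms; the total virial is summed
 * later in global_stat().
 */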
251 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
252                         tensor vir_part, t_graph *graph, matrix box,
253                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
254 {
255     int    i;
256
257     /* The short-range virial from surrounding boxes */
258     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
259     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
260
261     /* Calculate partial virial, for local atoms only, based on short range.
262      * Total virial is computed in global_stat, called from do_md
263      */
264     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
265     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
266
267     /* Add wall contribution */
268     for (i = 0; i < DIM; i++)
269     {
270         vir_part[i][ZZ] += fr->vir_wall_z[i];
271     }
272
273     if (debug)
274     {
275         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
276     }
277 }
278
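/*! \brief Compute the center-of-mass pull potential and forces and accumulate
 * them into \p force, \p enerd and the restraint dV/dlambda term.
 */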
279 static void pull_potential_wrapper(const t_commrec *cr,
280                                    t_inputrec *ir,
281                                    matrix box, rvec x[],
282                                    ForceWithVirial *force,
283                                    t_mdatoms *mdatoms,
284                                    gmx_enerdata_t *enerd,
285                                    real *lambda,
286                                    double t,
287                                    gmx_wallcycle_t wcycle)
288 {
289     t_pbc  pbc;
290     real   dvdl;
291
292     /* Calculate the center of mass forces; this requires communication,
293      * which is why pull_potential is called close to other communication.
294      */
295     wallcycle_start(wcycle, ewcPULLPOT);
296     set_pbc(&pbc, ir->ePBC, box);
297     dvdl                     = 0;
298     enerd->term[F_COM_PULL] +=
299         pull_potential(ir->pull_work, mdatoms, &pbc,
300                        cr, t, lambda[efptRESTRAINT], x, force, &dvdl);
301     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
302     wallcycle_stop(wcycle, ewcPULLPOT);
303 }
304
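/*! \brief Receive the long-range forces, virial, energies and dV/dlambda
 * from the separate PME ranks and accumulate them into \p forceWithVirial and \p enerd.
 */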
305 static void pme_receive_force_ener(const t_commrec *cr,
306                                    ForceWithVirial *forceWithVirial,
307                                    gmx_enerdata_t  *enerd,
308                                    gmx_wallcycle_t  wcycle)
309 {
310     real   e_q, e_lj, dvdl_q, dvdl_lj;
311     float  cycles_ppdpme, cycles_seppme;
312
313     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
314     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
315
316     /* In case of node-splitting, the PP nodes receive the long-range
317      * forces, virial and energy from the PME nodes here.
318      */
319     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
320     dvdl_q  = 0;
321     dvdl_lj = 0;
322     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
323                       &cycles_seppme);
324     enerd->term[F_COUL_RECIP] += e_q;
325     enerd->term[F_LJ_RECIP]   += e_lj;
326     enerd->dvdl_lin[efptCOUL] += dvdl_q;
327     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
328
329     if (wcycle)
330     {
331         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
332     }
333     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
334 }
335
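/*! \brief Print all local atoms whose force exceeds \p forceTolerance or is
 * non-finite, and issue a fatal error when non-finite forces were found.
 */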
336 static void print_large_forces(FILE *fp, t_mdatoms *md, const t_commrec *cr,
337                                gmx_int64_t step, real forceTolerance,
338                                const rvec *x, const rvec *f)
339 {
340     real           force2Tolerance = gmx::square(forceTolerance);
341     std::uintmax_t numNonFinite    = 0;
342     for (int i = 0; i < md->homenr; i++)
343     {
344         real force2    = norm2(f[i]);
345         bool nonFinite = !std::isfinite(force2);
346         if (force2 >= force2Tolerance || nonFinite)
347         {
348             fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
349                     step,
350                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
351         }
352         if (nonFinite)
353         {
354             numNonFinite++;
355         }
356     }
357     if (numNonFinite > 0)
358     {
359         /* Note that with MPI this fatal call on one rank might interrupt
360          * the printing on other ranks. But we can only avoid that with
361          * an expensive MPI barrier that we would need at each step.
362          */
363         gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
364     }
365 }
366
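/*! \brief Post-process the direct virial force contributions: spread vsite
 * forces, sum the separate force buffer into \p f, add its virial to
 * \p vir_force, and print large forces when requested.
 */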
367 static void post_process_forces(const t_commrec *cr,
368                                 gmx_int64_t step,
369                                 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
370                                 gmx_localtop_t *top,
371                                 matrix box, rvec x[],
372                                 rvec f[],
373                                 ForceWithVirial *forceWithVirial,
374                                 tensor vir_force,
375                                 t_mdatoms *mdatoms,
376                                 t_graph *graph,
377                                 t_forcerec *fr, gmx_vsite_t *vsite,
378                                 int flags)
379 {
380     if (fr->haveDirectVirialContributions)
381     {
382         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
383
384         if (vsite)
385         {
386             /* Spread the mesh force on virtual sites to the other particles...
387              * This is parallelized. MPI communication is performed
388              * if the constructing atoms aren't local.
389              */
390             matrix virial = { { 0 } };
391             spread_vsite_f(vsite, x, fDirectVir, nullptr,
392                            (flags & GMX_FORCE_VIRIAL), virial,
393                            nrnb,
394                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
395             forceWithVirial->addVirialContribution(virial);
396         }
397
398         if (flags & GMX_FORCE_VIRIAL)
399         {
400             /* Now add the forces, this is local */
401             sum_forces(f, forceWithVirial->force_);
402
403             /* Add the direct virial contributions */
404             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
405             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
406
407             if (debug)
408             {
409                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
410             }
411         }
412     }
413
414     if (fr->print_force >= 0)
415     {
416         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
417     }
418 }
419
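/*! \brief Run the CPU nonbonded Verlet kernel, or launch the GPU kernel, for the
 * given interaction locality, prune the CPU pair list at dynamic-pruning steps,
 * and account the pair-interaction flop counts.
 */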
420 static void do_nb_verlet(t_forcerec *fr,
421                          interaction_const_t *ic,
422                          gmx_enerdata_t *enerd,
423                          int flags, int ilocality,
424                          int clearF,
425                          gmx_int64_t step,
426                          t_nrnb *nrnb,
427                          gmx_wallcycle_t wcycle)
428 {
429     if (!(flags & GMX_FORCE_NONBONDED))
430     {
431         /* skip non-bonded calculation */
432         return;
433     }
434
435     nonbonded_verlet_t       *nbv  = fr->nbv;
436     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
437
438     /* GPU kernel launch overhead is already timed separately */
439     if (fr->cutoff_scheme != ecutsVERLET)
440     {
441         gmx_incons("Invalid cut-off scheme passed!");
442     }
443
444     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
445
446     if (!bUsingGpuKernels)
447     {
448         /* When dynamic pair-list pruning is requested, we need to prune
449          * at nstlistPrune steps.
450          */
451         if (nbv->listParams->useDynamicPruning &&
452             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
453         {
454             /* Prune the pair-list beyond fr->ic->rlistPrune using
455              * the current coordinates of the atoms.
456              */
457             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
458             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
459             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
460         }
461
462         wallcycle_sub_start(wcycle, ewcsNONBONDED);
463     }
464
465     switch (nbvg->kernel_type)
466     {
467         case nbnxnk4x4_PlainC:
468         case nbnxnk4xN_SIMD_4xN:
469         case nbnxnk4xN_SIMD_2xNN:
470             nbnxn_kernel_cpu(nbvg,
471                              nbv->nbat,
472                              ic,
473                              fr->shift_vec,
474                              flags,
475                              clearF,
476                              fr->fshift[0],
477                              enerd->grpp.ener[egCOULSR],
478                              fr->bBHAM ?
479                              enerd->grpp.ener[egBHAMSR] :
480                              enerd->grpp.ener[egLJSR]);
481             break;
482
483         case nbnxnk8x8x8_GPU:
484             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
485             break;
486
487         case nbnxnk8x8x8_PlainC:
488             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
489                                  nbv->nbat, ic,
490                                  fr->shift_vec,
491                                  flags,
492                                  clearF,
493                                  nbv->nbat->out[0].f,
494                                  fr->fshift[0],
495                                  enerd->grpp.ener[egCOULSR],
496                                  fr->bBHAM ?
497                                  enerd->grpp.ener[egBHAMSR] :
498                                  enerd->grpp.ener[egLJSR]);
499             break;
500
501         default:
502             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
503
504     }
505     if (!bUsingGpuKernels)
506     {
507         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
508     }
509
510     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
511     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
512     {
513         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
514     }
515     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
516              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
517     {
518         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
519     }
520     else
521     {
522         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
523     }
524     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
525     if (flags & GMX_FORCE_ENERGY)
526     {
527         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
528         enr_nbnxn_kernel_ljc += 1;
529         enr_nbnxn_kernel_lj  += 1;
530     }
531
532     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
533              nbvg->nbl_lists.natpair_ljq);
534     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
535              nbvg->nbl_lists.natpair_lj);
536     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
537     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
538              nbvg->nbl_lists.natpair_q);
539
540     if (ic->vdw_modifier == eintmodFORCESWITCH)
541     {
542         /* We add up the switch cost separately */
543         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
545     }
546     if (ic->vdw_modifier == eintmodPOTSWITCH)
547     {
548         /* We add up the switch cost separately */
549         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
550                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
551     }
552     if (ic->vdwtype == evdwPME)
553     {
554         /* We add up the LJ Ewald cost separately */
555         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
556                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
557     }
558 }
559
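/*! \brief Compute the perturbed non-bonded interactions with the free-energy
 * kernel, including the energies for foreign lambda points when requested.
 */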
560 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
561                              t_forcerec           *fr,
562                              rvec                  x[],
563                              rvec                  f[],
564                              t_mdatoms            *mdatoms,
565                              t_lambda             *fepvals,
566                              real                 *lambda,
567                              gmx_enerdata_t       *enerd,
568                              int                   flags,
569                              t_nrnb               *nrnb,
570                              gmx_wallcycle_t       wcycle)
571 {
572     int              donb_flags;
573     nb_kernel_data_t kernel_data;
574     real             lam_i[efptNR];
575     real             dvdl_nb[efptNR];
576     int              th;
577     int              i, j;
578
579     donb_flags = 0;
580     /* Add short-range interactions */
581     donb_flags |= GMX_NONBONDED_DO_SR;
582
583     /* Currently all group scheme kernels always calculate (shift-)forces */
584     if (flags & GMX_FORCE_FORCES)
585     {
586         donb_flags |= GMX_NONBONDED_DO_FORCE;
587     }
588     if (flags & GMX_FORCE_VIRIAL)
589     {
590         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
591     }
592     if (flags & GMX_FORCE_ENERGY)
593     {
594         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
595     }
596
597     kernel_data.flags  = donb_flags;
598     kernel_data.lambda = lambda;
599     kernel_data.dvdl   = dvdl_nb;
600
601     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
602     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
603
604     /* reset free energy components */
605     for (i = 0; i < efptNR; i++)
606     {
607         dvdl_nb[i]  = 0;
608     }
609
610     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
611
612     wallcycle_sub_start(wcycle, ewcsNONBONDED);
613 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
614     for (th = 0; th < nbl_lists->nnbl; th++)
615     {
616         try
617         {
618             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
619                                       x, f, fr, mdatoms, &kernel_data, nrnb);
620         }
621         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
622     }
623
624     if (fepvals->sc_alpha != 0)
625     {
626         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
627         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
628     }
629     else
630     {
631         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
632         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
633     }
634
635     /* If we are doing foreign lambdas and have soft-core interactions,
636      * we have to recalculate the (non-linear) energy contributions.
637      */
638     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
639     {
640         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
641         kernel_data.lambda         = lam_i;
642         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
643         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
644         /* Note that we add to kernel_data.dvdl, but ignore the result */
645
646         for (i = 0; i < enerd->n_lambda; i++)
647         {
648             for (j = 0; j < efptNR; j++)
649             {
650                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
651             }
652             reset_foreign_enerdata(enerd);
653 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
654             for (th = 0; th < nbl_lists->nnbl; th++)
655             {
656                 try
657                 {
658                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
659                                               x, f, fr, mdatoms, &kernel_data, nrnb);
660                 }
661                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
662             }
663
664             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
665             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
666         }
667     }
668
669     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
670 }
671
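/*! \brief Return whether the non-bonded Verlet module uses a GPU */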
672 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
673 {
674     return nbv != nullptr && nbv->bUseGPU;
675 }
676
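/*! \brief Zero the first \p n elements of \p v, using OpenMP threads when beneficial */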
677 static inline void clear_rvecs_omp(int n, rvec v[])
678 {
679     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
680
681     /* Note that we would like to avoid this conditional by putting it
682      * into the omp pragma instead, but then we still take the full
683      * omp parallel for overhead (at least with gcc5).
684      */
685     if (nth == 1)
686     {
687         for (int i = 0; i < n; i++)
688         {
689             clear_rvec(v[i]);
690         }
691     }
692     else
693     {
694 #pragma omp parallel for num_threads(nth) schedule(static)
695         for (int i = 0; i < n; i++)
696         {
697             clear_rvec(v[i]);
698         }
699     }
700 }
701
702 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
703  *
704  * \param groupOptions  Group options, containing T-coupling options
705  */
706 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
707 {
708     real nrdfCoupled   = 0;
709     real nrdfUncoupled = 0;
710     real kineticEnergy = 0;
711     for (int g = 0; g < groupOptions.ngtc; g++)
712     {
713         if (groupOptions.tau_t[g] >= 0)
714         {
715             nrdfCoupled   += groupOptions.nrdf[g];
716             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
717         }
718         else
719         {
720             nrdfUncoupled += groupOptions.nrdf[g];
721         }
722     }
723
724     /* This conditional with > also catches nrdf=0 */
725     if (nrdfCoupled > nrdfUncoupled)
726     {
727         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
728     }
729     else
730     {
731         return 0;
732     }
733 }
734
735 /*! \brief This routine checks that the potential energy is finite.
736  *
737  * Always checks that the potential energy is finite. If step equals
738  * inputrec.init_step, it also checks that the magnitude of the potential energy
739  * is reasonable. Terminates with a fatal error when a check fails.
740  * Note that passing this check does not guarantee finite forces,
741  * since those use slightly different arithmetics. But in most cases
742  * there is just a narrow coordinate range where forces are not finite
743  * and energies are finite.
744  *
745  * \param[in] step      The step number, used for checking and printing
746  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
747  * \param[in] inputrec  The input record
748  */
749 static void checkPotentialEnergyValidity(gmx_int64_t           step,
750                                          const gmx_enerdata_t &enerd,
751                                          const t_inputrec     &inputrec)
752 {
753     /* Threshold valid for comparing absolute potential energy against
754      * the kinetic energy. Normally one should not consider absolute
755      * potential energy values, but with a factor of one million
756      * we should never get false positives.
757      */
758     constexpr real c_thresholdFactor = 1e6;
759
760     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
761     real           averageKineticEnergy = 0;
762     /* We only check for large potential energy at the initial step,
763      * because that is by far the most likely step for this to occur
764      * and because computing the average kinetic energy is not free.
765      * Note: nstcalcenergy >> 1 often does not allow catching large energies
766      * before they become NaN.
767      */
768     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
769     {
770         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
771     }
772
773     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
774                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
775     {
776         gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
777                   step,
778                   enerd.term[F_EPOT],
779                   energyIsNotFinite ? "not finite" : "extremely high",
780                   enerd.term[F_LJ],
781                   enerd.term[F_COUL_SR],
782                   energyIsNotFinite ? "non-finite" : "very high",
783                   energyIsNotFinite ? " or Nan" : "");
784     }
785 }
786
787 /*! \brief Compute forces and/or energies for special algorithms
788  *
789  * The intention is to collect all calls to algorithms that compute
790  * forces on local atoms only and that do not contribute to the local
791  * virial sum (but add their virial contribution separately).
792  * Eventually these should likely all become ForceProviders.
793  * Within this function the intention is to have algorithms that do
794  * global communication at the end, so global barriers within the MD loop
795  * are as close together as possible.
796  *
797  * \param[in]     fplog            The log file
798  * \param[in]     cr               The communication record
799  * \param[in]     inputrec         The input record
800  * \param[in]     step             The current MD step
801  * \param[in]     t                The current time
802  * \param[in,out] wcycle           Wallcycle accounting struct
803  * \param[in,out] forceProviders   Pointer to a list of force providers
804  * \param[in]     box              The unit cell
805  * \param[in]     x                The coordinates
806  * \param[in]     mdatoms          Per atom properties
807  * \param[in]     lambda           Array of free-energy lambda values
808  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
809  * \param[in,out] forceWithVirial  Force and virial buffers
810  * \param[in,out] enerd            Energy buffer
811  * \param[in,out] ed               Essential dynamics pointer
812  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
813  *
814  * \todo Remove bNS, which is used incorrectly.
815  * \todo Convert all other algorithms called here to ForceProviders.
816  */
817 static void
818 computeSpecialForces(FILE             *fplog,
819                      const t_commrec  *cr,
820                      t_inputrec       *inputrec,
821                      gmx_int64_t       step,
822                      double            t,
823                      gmx_wallcycle_t   wcycle,
824                      ForceProviders   *forceProviders,
825                      matrix            box,
826                      rvec             *x,
827                      t_mdatoms        *mdatoms,
828                      real             *lambda,
829                      int               forceFlags,
830                      ForceWithVirial  *forceWithVirial,
831                      gmx_enerdata_t   *enerd,
832                      gmx_edsam_t       ed,
833                      gmx_bool          bNS)
834 {
835     const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
836
837     /* NOTE: Currently all ForceProviders only provide forces.
838      *       When they also provide energies, remove this conditional.
839      */
840     if (computeForces)
841     {
842         /* Collect forces from modules */
843         forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
844     }
845
846     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
847     {
848         pull_potential_wrapper(cr, inputrec, box, x,
849                                forceWithVirial,
850                                mdatoms, enerd, lambda, t,
851                                wcycle);
852
853         if (inputrec->bDoAwh)
854         {
855             Awh &awh = *inputrec->awh;
856             enerd->term[F_COM_PULL] +=
857                 awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
858                                                  forceWithVirial,
859                                                  t, step, wcycle, fplog);
860         }
861     }
862
863     rvec *f = as_rvec_array(forceWithVirial->force_.data());
864
865     /* Add the forces from enforced rotation potentials (if any) */
866     if (inputrec->bRot)
867     {
868         wallcycle_start(wcycle, ewcROTadd);
869         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
870         wallcycle_stop(wcycle, ewcROTadd);
871     }
872
873     if (ed)
874     {
875         /* Note that since init_edsam() is called after the initialization
876          * of forcerec, edsam doesn't request the noVirSum force buffer.
877          * Thus if no other algorithm (e.g. PME) requires it, the forces
878          * here will contribute to the virial.
879          */
880         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
881     }
882
883     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
884     if (inputrec->bIMD && computeForces)
885     {
886         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
887     }
888 }
889
890 /*! \brief Launch the prepare_step and spread stages of PME GPU.
891  *
892  * \param[in]  pmedata       The PME structure
893  * \param[in]  box           The box matrix
894  * \param[in]  x             Coordinate array
895  * \param[in]  flags         Force flags
896  * \param[in]  wcycle        The wallcycle structure
897  */
898 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
899                                       matrix          box,
900                                       rvec            x[],
901                                       int             flags,
902                                       gmx_wallcycle_t wcycle)
903 {
904     int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
905     pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
906     pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
907
908     pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
909     pme_gpu_launch_spread(pmedata, x, wcycle);
910 }
911
912 /*! \brief Launch the FFT and gather stages of PME GPU
913  *
914  * This function only implements setting the output forces (no accumulation).
915  *
916  * \param[in]  pmedata        The PME structure
917  * \param[in]  wcycle         The wallcycle structure
918  */
919 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
920                                      gmx_wallcycle_t   wcycle)
921 {
922     pme_gpu_launch_complex_transforms(pmedata, wcycle);
923     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
924 }
925
926 /*! \brief
927  *  Polling wait for either of the PME or nonbonded GPU tasks.
928  *
929  * Instead of a static order in waiting for GPU tasks, this function
930  * polls to check which of the two tasks completes first, and does the
931  * associated force buffer reduction overlapped with the other task.
932  * By doing that, unlike static scheduling order, it can always overlap
933  * one of the reductions, regardless of the GPU task completion order.
934  *
935  * \param[in]     nbv              Nonbonded verlet structure
936  * \param[in]     pmedata          PME module data
937  * \param[in,out] force            Force array to reduce task outputs into.
938  * \param[in,out] forceWithVirial  Force and virial buffers
939  * \param[in,out] fshift           Shift force output vector results are reduced into
940  * \param[in,out] enerd            Energy data structure results are reduced into
941  * \param[in]     flags            Force flags
942  * \param[in]     wcycle           The wallcycle structure
943  */
944 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
945                                         const gmx_pme_t                *pmedata,
946                                         gmx::PaddedArrayRef<gmx::RVec> *force,
947                                         ForceWithVirial                *forceWithVirial,
948                                         rvec                            fshift[],
949                                         gmx_enerdata_t                 *enerd,
950                                         int                             flags,
951                                         gmx_wallcycle_t                 wcycle)
952 {
953     bool isPmeGpuDone = false;
954     bool isNbGpuDone  = false;
955
956
957     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
958
959     while (!isPmeGpuDone || !isNbGpuDone)
960     {
961         if (!isPmeGpuDone)
962         {
963             matrix            vir_Q;
964             real              Vlr_q;
965
966             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
967             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
968                                                    vir_Q, &Vlr_q, completionType);
969
970             if (isPmeGpuDone)
971             {
972                 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
973                                        enerd, vir_Q, Vlr_q);
974             }
975         }
976
977         if (!isNbGpuDone)
978         {
979             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
980             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
981             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
982                                                     flags, eatLocal,
983                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
984                                                     fshift, completionType);
985             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
986             // To get the call count right, when the task has finished we
987             // issue a start/stop.
988             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
989             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
990             if (isNbGpuDone)
991             {
992                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
993                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
994
995                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
996                                                nbv->nbat, as_rvec_array(force->data()), wcycle);
997             }
998         }
999     }
1000 }
1001
1002 /*! \brief
1003  *  Launch the dynamic rolling pruning GPU task.
1004  *
1005  *  We currently alternate local/non-local list pruning in odd-even steps
1006  *  (only pruning every second step without DD).
1007  *
1008  * \param[in]     cr               The communication record
1009  * \param[in]     nbv              Nonbonded verlet structure
1010  * \param[in]     inputrec         The input record
1011  * \param[in]     step             The current MD step
1012  */
1013 static inline void launchGpuRollingPruning(const t_commrec          *cr,
1014                                            const nonbonded_verlet_t *nbv,
1015                                            const t_inputrec         *inputrec,
1016                                            const gmx_int64_t         step)
1017 {
1018     /* We should not launch the rolling pruning kernel at a search
1019      * step or just before search steps, since that's useless.
1020      * Without domain decomposition we prune at even steps.
1021      * With domain decomposition we alternate local and non-local
1022      * pruning at even and odd steps.
1023      */
1024     int  numRollingParts     = nbv->listParams->numRollingParts;
1025     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1026     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1027     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1028     if (stepWithCurrentList > 0 &&
1029         stepWithCurrentList < inputrec->nstlist - 1 &&
1030         (stepIsEven || DOMAINDECOMP(cr)))
1031     {
1032         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1033                                           stepIsEven ? eintLocal : eintNonlocal,
1034                                           numRollingParts);
1035     }
1036 }
1037
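/*! \brief Compute forces and energies with the Verlet cut-off scheme.
 *
 * Performs pair search when needed, launches the GPU non-bonded and PME tasks,
 * computes the remaining contributions on the CPU, and reduces all outputs
 * into the force, virial and energy buffers.
 */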
1038 static void do_force_cutsVERLET(FILE *fplog, const t_commrec *cr,
1039                                 const gmx_multisim_t *ms,
1040                                 t_inputrec *inputrec,
1041                                 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1042                                 gmx_localtop_t *top,
1043                                 gmx_groups_t gmx_unused *groups,
1044                                 matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1045                                 gmx::PaddedArrayRef<gmx::RVec> force,
1046                                 tensor vir_force,
1047                                 t_mdatoms *mdatoms,
1048                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1049                                 real *lambda, t_graph *graph,
1050                                 t_forcerec *fr, interaction_const_t *ic,
1051                                 gmx_vsite_t *vsite, rvec mu_tot,
1052                                 double t, gmx_edsam_t ed,
1053                                 int flags,
1054                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1055                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1056 {
1057     int                 cg1, i, j;
1058     double              mu[2*DIM];
1059     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1060     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1061     rvec                vzero, box_diag;
1062     float               cycles_pme, cycles_wait_gpu;
1063     nonbonded_verlet_t *nbv = fr->nbv;
1064
1065     bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1066     bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1067     bFillGrid     = (bNS && bStateChanged);
1068     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1069     bDoForces     = (flags & GMX_FORCE_FORCES);
1070     bUseGPU       = fr->nbv->bUseGPU;
1071     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1072
1073     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1074     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1075     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1076         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1077
1078     /* At a search step we need to start the first balancing region
1079      * somewhere early inside the step after communication during domain
1080      * decomposition (and not during the previous step as usual).
1081      */
1082     if (bNS &&
1083         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1084     {
1085         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1086     }
1087
1088     cycles_wait_gpu = 0;
1089
1090     const int start  = 0;
1091     const int homenr = mdatoms->homenr;
1092
1093     clear_mat(vir_force);
1094
1095     if (DOMAINDECOMP(cr))
1096     {
1097         cg1 = cr->dd->ncg_tot;
1098     }
1099     else
1100     {
1101         cg1 = top->cgs.nr;
1102     }
1103     if (fr->n_tpi > 0)
1104     {
1105         cg1--;
1106     }
1107
1108     if (bStateChanged)
1109     {
1110         update_forcerec(fr, box);
1111
1112         if (inputrecNeedMutot(inputrec))
1113         {
1114             /* Calculate total (local) dipole moment in a temporary common array.
1115              * This makes it possible to sum them over nodes faster.
1116              */
1117             calc_mu(start, homenr,
1118                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1119                     mu, mu+DIM);
1120         }
1121     }
1122
1123     if (fr->ePBC != epbcNONE)
1124     {
1125         /* Compute shift vectors every step,
1126          * because of pressure coupling or box deformation!
1127          */
1128         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1129         {
1130             calc_shifts(box, fr->shift_vec);
1131         }
1132
1133         if (bCalcCGCM)
1134         {
1135             put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1136             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1137         }
1138         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1139         {
1140             unshift_self(graph, box, as_rvec_array(x.data()));
1141         }
1142     }
1143
1144     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1145                                  fr->shift_vec, nbv->nbat);
1146
1147 #if GMX_MPI
1148     if (!thisRankHasDuty(cr, DUTY_PME))
1149     {
1150         /* Send particle coordinates to the pme nodes.
1151          * Since this is only implemented for domain decomposition
1152          * and domain decomposition does not use the graph,
1153          * we do not need to worry about shifting.
1154          */
1155         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1156                                  lambda[efptCOUL], lambda[efptVDW],
1157                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1158                                  step, wcycle);
1159     }
1160 #endif /* GMX_MPI */
1161
1162     if (useGpuPme)
1163     {
1164         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1165     }
1166
1167     /* do gridding for pair search */
1168     if (bNS)
1169     {
1170         if (graph && bStateChanged)
1171         {
1172             /* Calculate intramolecular shift vectors to make molecules whole */
1173             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1174         }
1175
1176         clear_rvec(vzero);
1177         box_diag[XX] = box[XX][XX];
1178         box_diag[YY] = box[YY][YY];
1179         box_diag[ZZ] = box[ZZ][ZZ];
1180
1181         wallcycle_start(wcycle, ewcNS);
1182         if (!fr->bDomDec)
1183         {
1184             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1185             nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
1186                               0, vzero, box_diag,
1187                               0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1188                               0, nullptr,
1189                               nbv->grp[eintLocal].kernel_type,
1190                               nbv->nbat);
1191             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1192         }
1193         else
1194         {
1195             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1196             nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
1197                                        fr->cginfo, as_rvec_array(x.data()),
1198                                        nbv->grp[eintNonlocal].kernel_type,
1199                                        nbv->nbat);
1200             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1201         }
1202
1203         nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
1204         wallcycle_stop(wcycle, ewcNS);
1205     }
1206
1207     /* initialize the GPU atom data and copy shift vector */
1208     if (bUseGPU)
1209     {
1210         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1211         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1212
1213         if (bNS)
1214         {
1215             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1216         }
1217
1218         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1219
1220         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1221         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1222     }
1223
1224     /* do local pair search */
1225     if (bNS)
1226     {
1227         wallcycle_start_nocount(wcycle, ewcNS);
1228         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1229         nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1230                             &top->excls,
1231                             nbv->listParams->rlistOuter,
1232                             nbv->min_ci_balanced,
1233                             &nbv->grp[eintLocal].nbl_lists,
1234                             eintLocal,
1235                             nbv->grp[eintLocal].kernel_type,
1236                             nrnb);
1237         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1238         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1239         {
1240             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1241         }
1242         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1243
1244         if (bUseGPU)
1245         {
1246             /* initialize local pair-list on the GPU */
1247             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1248                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
1249                                     eintLocal);
1250         }
1251         wallcycle_stop(wcycle, ewcNS);
1252     }
1253     else
1254     {
1255         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
1256                                         nbv->nbat, wcycle);
1257     }
1258
1259     if (bUseGPU)
1260     {
1261         if (DOMAINDECOMP(cr))
1262         {
1263             ddOpenBalanceRegionGpu(cr->dd);
1264         }
1265
1266         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1267         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1268         /* launch local nonbonded work on GPU */
1269         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1270                      step, nrnb, wcycle);
1271         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1272         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1273     }
1274
1275     if (useGpuPme)
1276     {
1277         // In PME GPU and mixed mode we launch FFT / gather after the
1278         // X copy/transform to allow overlap as well as after the GPU NB
1279         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1280         // the nonbonded kernel.
1281         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1282     }
1283
1284     /* Communicate coordinates and sum dipole if necessary +
1285        do non-local pair search */
1286     if (DOMAINDECOMP(cr))
1287     {
1288         if (bNS)
1289         {
1290             wallcycle_start_nocount(wcycle, ewcNS);
1291             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1292
1293             nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1294                                 &top->excls,
1295                                 nbv->listParams->rlistOuter,
1296                                 nbv->min_ci_balanced,
1297                                 &nbv->grp[eintNonlocal].nbl_lists,
1298                                 eintNonlocal,
1299                                 nbv->grp[eintNonlocal].kernel_type,
1300                                 nrnb);
1301             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1302             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1303             {
1304                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1305             }
1306             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1307
1308             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1309             {
1310                 /* initialize non-local pair-list on the GPU */
1311                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1312                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1313                                         eintNonlocal);
1314             }
1315             wallcycle_stop(wcycle, ewcNS);
1316         }
1317         else
1318         {
1319             dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1320
1321             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
1322                                             nbv->nbat, wcycle);
1323         }
1324
1325         if (bUseGPU)
1326         {
1327             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1328             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1329             /* launch non-local nonbonded tasks on GPU */
1330             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1331                          step, nrnb, wcycle);
1332             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1333             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1334         }
1335     }
1336
1337     if (bUseGPU)
1338     {
1339         /* launch D2H copy-back F */
1340         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1341         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1342         if (DOMAINDECOMP(cr))
1343         {
1344             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1345                                      flags, eatNonlocal);
1346         }
1347         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1348                                  flags, eatLocal);
1349         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1351     }
1352
1353     if (bStateChanged && inputrecNeedMutot(inputrec))
1354     {
1355         if (PAR(cr))
1356         {
1357             gmx_sumd(2*DIM, mu, cr);
1358             ddReopenBalanceRegionCpu(cr->dd);
1359         }
1360
1361         for (i = 0; i < 2; i++)
1362         {
1363             for (j = 0; j < DIM; j++)
1364             {
1365                 fr->mu_tot[i][j] = mu[i*DIM + j];
1366             }
1367         }
1368     }
1369     if (fr->efep == efepNO)
1370     {
1371         copy_rvec(fr->mu_tot[0], mu_tot);
1372     }
1373     else
1374     {
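             /* With free-energy perturbation the total dipole is interpolated
              * linearly in the Coulomb lambda between the A and B states:
              *   mu_tot = (1 - lambda[efptCOUL])*mu_A + lambda[efptCOUL]*mu_B
              */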
1375         for (j = 0; j < DIM; j++)
1376         {
1377             mu_tot[j] =
1378                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1379                 lambda[efptCOUL]*fr->mu_tot[1][j];
1380         }
1381     }
1382
1383     /* Reset energies */
1384     reset_enerdata(enerd);
1385     clear_rvecs(SHIFTS, fr->fshift);
1386
1387     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1388     {
1389         wallcycle_start(wcycle, ewcPPDURINGPME);
1390         dd_force_flop_start(cr->dd, nrnb);
1391     }
1392
1393     if (inputrec->bRot)
1394     {
1395         wallcycle_start(wcycle, ewcROT);
1396         do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1397         wallcycle_stop(wcycle, ewcROT);
1398     }
1399
1400     /* Temporary solution until all routines take PaddedRVecVector */
1401     rvec *f = as_rvec_array(force.data());
1402
1403     /* Start the force cycle counter.
1404      * Note that a different counter is used for dynamic load balancing.
1405      */
1406     wallcycle_start(wcycle, ewcFORCE);
1407
1408     gmx::ArrayRef<gmx::RVec> forceRef = force;
1409     if (bDoForces)
1410     {
1411         /* If we need to compute the virial, we might need a separate
1412          * force buffer for algorithms for which the virial is calculated
1413          * directly, such as PME.
1414          */
1415         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1416         {
1417             forceRef = *fr->forceBufferForDirectVirialContributions;
1418
1419             /* We only compute forces on local atoms. Note that vsites can
1420              * spread to non-local atoms, but that part of the buffer is
1421              * cleared separately in the vsite spreading code.
1422              */
1423             clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1424         }
1425         /* Clear the short- and long-range forces */
1426         clear_rvecs_omp(fr->natoms_force_constr, f);
1427     }
1428
1429     /* forceWithVirial uses the local atom range only */
1430     ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1431
1432     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1433     {
1434         clear_pull_forces(inputrec->pull_work);
1435     }
1436
1437     /* We calculate the non-bonded forces, when done on the CPU, here.
1438      * We do this before calling do_force_lowlevel, because in that
1439      * function, the listed forces are calculated before PME, which
1440      * does communication.  With this order, non-bonded and listed
1441      * force calculation imbalance can be balanced out by the domain
1442      * decomposition load balancing.
1443      */
1444
1445     if (!bUseOrEmulGPU)
1446     {
1447         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1448                      step, nrnb, wcycle);
1449     }
1450
1451     if (fr->efep != efepNO)
1452     {
1453         /* Calculate the local and non-local free energy interactions here.
1454          * These always run on the CPU, both with and without GPU acceleration.
1455          */
1456         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1457         {
1458             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1459                              fr, as_rvec_array(x.data()), f, mdatoms,
1460                              inputrec->fepvals, lambda,
1461                              enerd, flags, nrnb, wcycle);
1462         }
1463
1464         if (DOMAINDECOMP(cr) &&
1465             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1466         {
1467             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1468                              fr, as_rvec_array(x.data()), f, mdatoms,
1469                              inputrec->fepvals, lambda,
1470                              enerd, flags, nrnb, wcycle);
1471         }
1472     }
1473
1474     if (!bUseOrEmulGPU)
1475     {
1476         int aloc;
1477
1478         if (DOMAINDECOMP(cr))
1479         {
1480             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1481                          step, nrnb, wcycle);
1482         }
1483
1484         /* bUseOrEmulGPU is false on this path, so the local interaction group applies below */
1485         aloc = eintLocal;
1492
1493         /* Add all the non-bonded forces to the normal force array.
1494          * This can be split into a local and a non-local part when overlapping
1495          * communication with computation under domain decomposition.
1496          */
1497         wallcycle_stop(wcycle, ewcFORCE);
1498
1499         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f, wcycle);
1500
1501         wallcycle_start_nocount(wcycle, ewcFORCE);
1502
1503         /* if there are multiple fshift output buffers reduce them */
1504         if ((flags & GMX_FORCE_VIRIAL) &&
1505             nbv->grp[aloc].nbl_lists.nnbl > 1)
1506         {
1507             /* This is not in a subcounter because it takes a
1508                negligible and constant-sized amount of time */
1509             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1510                                                      fr->fshift);
1511         }
1512     }
1513
1514     /* update QMMMrec, if necessary */
1515     if (fr->bQMMM)
1516     {
1517         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1518     }
1519
1520     /* Compute the bonded and non-bonded energies and optionally forces */
1521     do_force_lowlevel(fr, inputrec, &(top->idef),
1522                       cr, ms, nrnb, wcycle, mdatoms,
1523                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1524                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1525                       flags, &cycles_pme);
1526
1527     wallcycle_stop(wcycle, ewcFORCE);
1528
1529     computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1530                          fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
1531                          flags, &forceWithVirial, enerd,
1532                          ed, bNS);
1533
1534     if (bUseOrEmulGPU)
1535     {
1536         /* wait for non-local forces (or calculate in emulation mode) */
1537         if (DOMAINDECOMP(cr))
1538         {
1539             if (bUseGPU)
1540             {
1541                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1542                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1543                                            flags, eatNonlocal,
1544                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1545                                            fr->fshift);
1546                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1547             }
1548             else
1549             {
1550                 wallcycle_start_nocount(wcycle, ewcFORCE);
1551                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1552                              step, nrnb, wcycle);
1553                 wallcycle_stop(wcycle, ewcFORCE);
1554             }
1555
1556             /* skip the reduction if there was no non-local work to do */
1557             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1558             {
1559                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1560                                                nbv->nbat, f, wcycle);
1561             }
1562         }
1563     }
1564
1565     if (DOMAINDECOMP(cr))
1566     {
1567         /* We are done with the CPU compute.
1568          * We will now communicate the non-local forces.
1569          * If we use a GPU this will overlap with GPU work, so in that case
1570          * we do not close the DD force balancing region here.
1571          */
1572         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1573         {
1574             ddCloseBalanceRegionCpu(cr->dd);
1575         }
1576         if (bDoForces)
1577         {
1578             dd_move_f(cr->dd, f, fr->fshift, wcycle);
1579         }
1580     }
1581
1582     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1583     // an alternating wait/reduction scheme.
1584     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1585     if (alternateGpuWait)
1586     {
1587         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1588     }
1589
1590     if (!alternateGpuWait && useGpuPme)
1591     {
1592         gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1593         matrix vir_Q;
1594         real   Vlr_q;
1595         pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1596         pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1597     }
1598
1599     /* Wait for local GPU NB outputs on the non-alternating wait path */
1600     if (!alternateGpuWait && bUseGPU)
1601     {
1602         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1603          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1604          * but even with a step of 0.1 ms the difference is less than 1%
1605          * of the step time.
1606          */
1607         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1608
1609         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1610         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1611                                    flags, eatLocal,
1612                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1613                                    fr->fshift);
1614         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1615
1616         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1617         {
1618             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1619             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1620             {
1621                 /* We measured few cycles, it could be that the kernel
1622                  * and transfer finished earlier and there was no actual
1623                  * wait time, only API call overhead.
1624                  * Then the actual time could be anywhere between 0 and
1625                  * cycles_wait_est. We will use half of cycles_wait_est.
1626                  */
1627                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1628             }
1629             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1630         }
1631     }
1632
1633     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1634     {
1635         // NOTE: emulation kernel is not included in the balancing region,
1636         // but emulation mode does not target performance anyway
1637         wallcycle_start_nocount(wcycle, ewcFORCE);
1638         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1639                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1640                      step, nrnb, wcycle);
1641         wallcycle_stop(wcycle, ewcFORCE);
1642     }
1643
1644     if (bUseGPU)
1645     {
1646         /* now clear the GPU outputs while we finish the step on the CPU */
1647         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1648         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1649         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1650
1651         /* Is dynamic pair-list pruning activated? */
1652         if (nbv->listParams->useDynamicPruning)
1653         {
1654             launchGpuRollingPruning(cr, nbv, inputrec, step);
1655         }
1656         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1657         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1658
1659         // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
1660     }
1661
1662     /* Do the nonbonded GPU (or emulation) force buffer reduction
1663      * on the non-alternating path. */
1664     if (bUseOrEmulGPU && !alternateGpuWait)
1665     {
1666         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1667                                        nbv->nbat, f, wcycle);
1668     }
1669
1670     if (DOMAINDECOMP(cr))
1671     {
1672         dd_force_flop_stop(cr->dd, nrnb);
1673     }
1674
1675     if (bDoForces)
1676     {
1677         /* If we have NoVirSum forces, but we do not calculate the virial,
1678          * we sum fr->f_novirsum=f later.
1679          */
1680         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1681         {
1682             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1683                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1684         }
1685
1686         if (flags & GMX_FORCE_VIRIAL)
1687         {
1688             /* Calculation of the virial must be done after vsites! */
1689             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1690                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1691         }
1692     }
1693
1694     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1695     {
1696         /* In case of node-splitting, the PP nodes receive the long-range
1697          * forces, virial and energy from the PME nodes here.
1698          */
1699         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1700     }
1701
1702     if (bDoForces)
1703     {
1704         post_process_forces(cr, step, nrnb, wcycle,
1705                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1706                             vir_force, mdatoms, graph, fr, vsite,
1707                             flags);
1708     }
1709
1710     if (flags & GMX_FORCE_ENERGY)
1711     {
1712         /* Sum the potential energy terms from group contributions */
1713         sum_epot(&(enerd->grpp), enerd->term);
1714
1715         if (!EI_TPI(inputrec->eI))
1716         {
1717             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1718         }
1719     }
1720 }
1721
1722 static void do_force_cutsGROUP(FILE *fplog, const t_commrec *cr,
1723                                const gmx_multisim_t *ms,
1724                                t_inputrec *inputrec,
1725                                gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1726                                gmx_localtop_t *top,
1727                                gmx_groups_t *groups,
1728                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
1729                                gmx::PaddedArrayRef<gmx::RVec> force,
1730                                tensor vir_force,
1731                                t_mdatoms *mdatoms,
1732                                gmx_enerdata_t *enerd, t_fcdata *fcd,
1733                                real *lambda, t_graph *graph,
1734                                t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1735                                double t, gmx_edsam_t ed,
1736                                int flags,
1737                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1738                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1739 {
1740     int        cg0, cg1, i, j;
1741     double     mu[2*DIM];
1742     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1743     gmx_bool   bDoForces;
1744     float      cycles_pme;
1745
1746     const int  start  = 0;
1747     const int  homenr = mdatoms->homenr;
1748
1749     clear_mat(vir_force);
1750
1751     cg0 = 0;
1752     if (DOMAINDECOMP(cr))
1753     {
1754         cg1 = cr->dd->ncg_tot;
1755     }
1756     else
1757     {
1758         cg1 = top->cgs.nr;
1759     }
1760     if (fr->n_tpi > 0)
1761     {
1762         cg1--;
1763     }
1764
1765     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1766     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1767     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1768     bFillGrid      = (bNS && bStateChanged);
1769     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1770     bDoForces      = (flags & GMX_FORCE_FORCES);
1771
1772     if (bStateChanged)
1773     {
1774         update_forcerec(fr, box);
1775
1776         if (inputrecNeedMutot(inputrec))
1777         {
1778             /* Calculate total (local) dipole moment in a temporary common array.
1779              * This makes it possible to sum them over nodes faster.
1780              */
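                 /* Layout: mu[0..DIM-1] accumulates the state A (chargeA) dipole and
                  * mu[DIM..2*DIM-1] the state B (chargeB) dipole; both are unpacked
                  * into fr->mu_tot after the optional sum over ranks below.
                  */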
1781             calc_mu(start, homenr,
1782                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1783                     mu, mu+DIM);
1784         }
1785     }
1786
1787     if (fr->ePBC != epbcNONE)
1788     {
1789         /* Compute shift vectors every step,
1790          * because of pressure coupling or box deformation!
1791          */
1792         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1793         {
1794             calc_shifts(box, fr->shift_vec);
1795         }
1796
1797         if (bCalcCGCM)
1798         {
1799             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1800                                      &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1801             inc_nrnb(nrnb, eNR_CGCM, homenr);
1802             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1803         }
1804         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1805         {
1806             unshift_self(graph, box, as_rvec_array(x.data()));
1807         }
1808     }
1809     else if (bCalcCGCM)
1810     {
1811         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1812         inc_nrnb(nrnb, eNR_CGCM, homenr);
1813     }
1814
1815     if (bCalcCGCM && gmx_debug_at)
1816     {
1817         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1818     }
1819
1820 #if GMX_MPI
1821     if (!thisRankHasDuty(cr, DUTY_PME))
1822     {
1823         /* Send particle coordinates to the pme nodes.
1824          * Since this is only implemented for domain decomposition
1825          * and domain decomposition does not use the graph,
1826          * we do not need to worry about shifting.
1827          */
1828         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1829                                  lambda[efptCOUL], lambda[efptVDW],
1830                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1831                                  step, wcycle);
1832     }
1833 #endif /* GMX_MPI */
1834
1835     /* Communicate coordinates and sum dipole if necessary */
1836     if (DOMAINDECOMP(cr))
1837     {
1838         dd_move_x(cr->dd, box, as_rvec_array(x.data()), wcycle);
1839
1840         /* No GPU support, no move_x overlap, so reopen the balance region here */
1841         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1842         {
1843             ddReopenBalanceRegionCpu(cr->dd);
1844         }
1845     }
1846
1847     if (inputrecNeedMutot(inputrec))
1848     {
1849         if (bStateChanged)
1850         {
1851             if (PAR(cr))
1852             {
1853                 gmx_sumd(2*DIM, mu, cr);
1854                 ddReopenBalanceRegionCpu(cr->dd);
1855             }
1856             for (i = 0; i < 2; i++)
1857             {
1858                 for (j = 0; j < DIM; j++)
1859                 {
1860                     fr->mu_tot[i][j] = mu[i*DIM + j];
1861                 }
1862             }
1863         }
1864         if (fr->efep == efepNO)
1865         {
1866             copy_rvec(fr->mu_tot[0], mu_tot);
1867         }
1868         else
1869         {
1870             for (j = 0; j < DIM; j++)
1871             {
1872                 mu_tot[j] =
1873                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1874             }
1875         }
1876     }
1877
1878     /* Reset energies */
1879     reset_enerdata(enerd);
1880     clear_rvecs(SHIFTS, fr->fshift);
1881
1882     if (bNS)
1883     {
1884         wallcycle_start(wcycle, ewcNS);
1885
1886         if (graph && bStateChanged)
1887         {
1888             /* Calculate intramolecular shift vectors to make molecules whole */
1889             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1890         }
1891
1892         /* Do the actual neighbour searching */
1893         ns(fplog, fr, box,
1894            groups, top, mdatoms,
1895            cr, nrnb, bFillGrid);
1896
1897         wallcycle_stop(wcycle, ewcNS);
1898     }
1899
1900     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1901     {
1902         wallcycle_start(wcycle, ewcPPDURINGPME);
1903         dd_force_flop_start(cr->dd, nrnb);
1904     }
1905
1906     if (inputrec->bRot)
1907     {
1908         wallcycle_start(wcycle, ewcROT);
1909         do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1910         wallcycle_stop(wcycle, ewcROT);
1911     }
1912
1913     /* Temporary solution until all routines take PaddedRVecVector */
1914     rvec *f = as_rvec_array(force.data());
1915
1916     /* Start the force cycle counter.
1917      * Note that a different counter is used for dynamic load balancing.
1918      */
1919     wallcycle_start(wcycle, ewcFORCE);
1920
1921     gmx::ArrayRef<gmx::RVec> forceRef = force;
1922     if (bDoForces)
1923     {
1924         /* If we need to compute the virial, we might need a separate
1925          * force buffer for algorithms for which the virial is calculated
1926          * separately, such as PME.
1927          */
1928         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1929         {
1930             forceRef = *fr->forceBufferForDirectVirialContributions;
1931
1932             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1933         }
1934
1935         /* Clear the short- and long-range forces */
1936         clear_rvecs(fr->natoms_force_constr, f);
1937     }
1938
1939     /* forceWithVirial might need the full force atom range */
1940     ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1941
1942     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1943     {
1944         clear_pull_forces(inputrec->pull_work);
1945     }
1946
1947     /* update QMMMrec, if necessary */
1948     if (fr->bQMMM)
1949     {
1950         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1951     }
1952
1953     /* Compute the bonded and non-bonded energies and optionally forces */
1954     do_force_lowlevel(fr, inputrec, &(top->idef),
1955                       cr, ms, nrnb, wcycle, mdatoms,
1956                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1957                       box, inputrec->fepvals, lambda,
1958                       graph, &(top->excls), fr->mu_tot,
1959                       flags,
1960                       &cycles_pme);
1961
1962     wallcycle_stop(wcycle, ewcFORCE);
1963
1964     if (DOMAINDECOMP(cr))
1965     {
1966         dd_force_flop_stop(cr->dd, nrnb);
1967
1968         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1969         {
1970             ddCloseBalanceRegionCpu(cr->dd);
1971         }
1972     }
1973
1974     computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
1975                          fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
1976                          flags, &forceWithVirial, enerd,
1977                          ed, bNS);
1978
1979     if (bDoForces)
1980     {
1981         /* Communicate the forces */
1982         if (DOMAINDECOMP(cr))
1983         {
1984             dd_move_f(cr->dd, f, fr->fshift, wcycle);
1985             /* Do we need to communicate the separate force array
1986              * for terms that do not contribute to the single sum virial?
1987              * Position restraints and electric fields do not introduce
1988              * inter-cg forces, only full electrostatics methods do.
1989              * When we do not calculate the virial, fr->f_novirsum = f,
1990              * so we have already communicated these forces.
1991              */
1992             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1993                 (flags & GMX_FORCE_VIRIAL))
1994             {
1995                 dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr, wcycle);
1996             }
1997         }
1998
1999         /* If we have NoVirSum forces, but we do not calculate the virial,
2000          * we sum fr->f_novirsum=f later.
2001          */
2002         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2003         {
2004             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2005                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2006         }
2007
2008         if (flags & GMX_FORCE_VIRIAL)
2009         {
2010             /* Calculation of the virial must be done after vsites! */
2011             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2012                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2013         }
2014     }
2015
2016     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2017     {
2018         /* In case of node-splitting, the PP nodes receive the long-range
2019          * forces, virial and energy from the PME nodes here.
2020          */
2021         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2022     }
2023
2024     if (bDoForces)
2025     {
2026         post_process_forces(cr, step, nrnb, wcycle,
2027                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2028                             vir_force, mdatoms, graph, fr, vsite,
2029                             flags);
2030     }
2031
2032     if (flags & GMX_FORCE_ENERGY)
2033     {
2034         /* Sum the potential energy terms from group contributions */
2035         sum_epot(&(enerd->grpp), enerd->term);
2036
2037         if (!EI_TPI(inputrec->eI))
2038         {
2039             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2040         }
2041     }
2042
2043 }
2044
2045 void do_force(FILE *fplog, const t_commrec *cr,
2046               const gmx_multisim_t *ms,
2047               t_inputrec *inputrec,
2048               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2049               gmx_localtop_t *top,
2050               gmx_groups_t *groups,
2051               matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
2052               gmx::PaddedArrayRef<gmx::RVec> force,
2053               tensor vir_force,
2054               t_mdatoms *mdatoms,
2055               gmx_enerdata_t *enerd, t_fcdata *fcd,
2056               gmx::ArrayRef<real> lambda, t_graph *graph,
2057               t_forcerec *fr,
2058               gmx_vsite_t *vsite, rvec mu_tot,
2059               double t, gmx_edsam_t ed,
2060               int flags,
2061               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2062               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2063 {
2064     /* modify force flag if not doing nonbonded */
2065     if (!fr->bNonbonded)
2066     {
2067         flags &= ~GMX_FORCE_NONBONDED;
2068     }
2069
2070     GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2071     GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2072
2073     switch (inputrec->cutoff_scheme)
2074     {
2075         case ecutsVERLET:
2076             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2077                                 step, nrnb, wcycle,
2078                                 top,
2079                                 groups,
2080                                 box, x, hist,
2081                                 force, vir_force,
2082                                 mdatoms,
2083                                 enerd, fcd,
2084                                 lambda.data(), graph,
2085                                 fr, fr->ic,
2086                                 vsite, mu_tot,
2087                                 t, ed,
2088                                 flags,
2089                                 ddOpenBalanceRegion,
2090                                 ddCloseBalanceRegion);
2091             break;
2092         case ecutsGROUP:
2093             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2094                                step, nrnb, wcycle,
2095                                top,
2096                                groups,
2097                                box, x, hist,
2098                                force, vir_force,
2099                                mdatoms,
2100                                enerd, fcd,
2101                                lambda.data(), graph,
2102                                fr, vsite, mu_tot,
2103                                t, ed,
2104                                flags,
2105                                ddOpenBalanceRegion,
2106                                ddCloseBalanceRegion);
2107             break;
2108         default:
2109             gmx_incons("Invalid cut-off scheme passed!");
2110     }
2111
2112     /* In case we don't have constraints and are using GPUs, the next balancing
2113      * region starts here.
2114      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2115      * virial calculation and COM pulling, is thus not included in
2116      * the balance timing, which is ok as most tasks do communication.
2117      */
2118     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2119     {
2120         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2121     }
2122 }
2123
2124
2125 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
2126                         t_inputrec *ir, t_mdatoms *md,
2127                         t_state *state, const t_commrec *cr,
2128                         const gmx_multisim_t *ms,
2129                         t_nrnb *nrnb,
2130                         t_forcerec *fr, gmx_localtop_t *top)
2131 {
2132     int             i, m, start, end;
2133     gmx_int64_t     step;
2134     real            dt = ir->delta_t;
2135     real            dvdl_dum;
2136     rvec           *savex;
2137
2138     /* We need to allocate one element extra, since we might use
2139      * (unaligned) 4-wide SIMD loads to access rvec entries.
2140      */
2141     snew(savex, state->natoms + 1);
2142
2143     start = 0;
2144     end   = md->homenr;
2145
2146     if (debug)
2147     {
2148         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2149                 start, md->homenr, end);
2150     }
2151     /* Do a first constraint pass to reset the particles... */
2152     step = ir->init_step;
2153     if (fplog)
2154     {
2155         char buf[STEPSTRSIZE];
2156         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2157                 gmx_step_str(step, buf));
2158     }
2159     dvdl_dum = 0;
2160
2161     /* constrain the current position */
2162     constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2163               ir, cr, ms, step, 0, 1.0, md,
2164               as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2165               fr->bMolPBC, state->box,
2166               state->lambda[efptBONDED], &dvdl_dum,
2167               nullptr, nullptr, nrnb, econqCoord);
2168     if (EI_VV(ir->eI))
2169     {
2170         /* constrain the initial velocity, and save it */
2171         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2172         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2173                   ir, cr, ms, step, 0, 1.0, md,
2174                   as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2175                   fr->bMolPBC, state->box,
2176                   state->lambda[efptBONDED], &dvdl_dum,
2177                   nullptr, nullptr, nrnb, econqVeloc);
2178     }
2179     /* constrain the initial velocities at t-dt/2 */
2180     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2181     {
2182         for (i = start; (i < end); i++)
2183         {
2184             for (m = 0; (m < DIM); m++)
2185             {
2186                 /* Reverse the velocity */
2187                 state->v[i][m] = -state->v[i][m];
2188                 /* Store the position at t-dt in savex */
2189                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2190             }
2191         }
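             /* With the velocities negated above, savex holds x(0) - dt*v(0),
              * i.e. an estimate of the positions at t = -dt, which is constrained
              * below using x(0) as the reference coordinates.
              */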
2192         /* Shake the positions at t=-dt with the positions at t=0
2193          * as reference coordinates.
2194          */
2195         if (fplog)
2196         {
2197             char buf[STEPSTRSIZE];
2198             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2199                     gmx_step_str(step, buf));
2200         }
2201         dvdl_dum = 0;
2202         constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2203                   ir, cr, ms, step, -1, 1.0, md,
2204                   as_rvec_array(state->x.data()), savex, nullptr,
2205                   fr->bMolPBC, state->box,
2206                   state->lambda[efptBONDED], &dvdl_dum,
2207                   as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2208
2209         for (i = start; i < end; i++)
2210         {
2211             for (m = 0; m < DIM; m++)
2212             {
2213                 /* Re-reverse the velocities */
2214                 state->v[i][m] = -state->v[i][m];
2215             }
2216         }
2217     }
2218     sfree(savex);
2219 }
2220
2221
2222 static void
2223 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2224                 double *enerout, double *virout)
2225 {
2226     double enersum, virsum;
2227     double invscale, invscale2, invscale3;
2228     double r, ea, eb, ec, pa, pb, pc, pd;
2229     double y0, f, g, h;
2230     int    ri, offset;
2231     double tabfactor;
2232
2233     invscale  = 1.0/scale;
2234     invscale2 = invscale*invscale;
2235     invscale3 = invscale*invscale2;
2236
2237     /* Following summation derived from cubic spline definition,
2238      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2239      * the cubic spline.  We first calculate the negative of the
2240      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2241      * add the more standard, abrupt cutoff correction to that result,
2242      * yielding the long-range correction for a switched function.  We
2243      * perform both the pressure and energy loops at the same time for
2244      * simplicity, as the computational cost is low. */
2245
2246     if (offstart == 0)
2247     {
2248         /* Since the dispersion table has been scaled down a factor
2249          * 6.0 and the repulsion a factor 12.0 to compensate for the
2250          * c6/c12 parameters inside nbfp[] being scaled up (to save
2251          * flops in kernels), we need to correct for this.
2252          */
2253         tabfactor = 6.0;
2254     }
2255     else
2256     {
2257         tabfactor = 12.0;
2258     }
2259
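         /* Within table interval ri the potential is the cubic spline
          *   V(t) = y0 + f*t + g*t^2 + h*t^3,  with t in [0,1] and r = (ri + t)/scale.
          * The energy integrand r^2*V(r) dr then becomes (ec + eb*t + ea*t^2)*V(t) dt and
          * the virial integrand r^3*(dV/dr) dr becomes (pd + pc*t + pb*t^2 + pa*t^3)*(dV/dt) dt,
          * which integrate term by term to the sums accumulated in the loop below.
          */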
2260     enersum = 0.0;
2261     virsum  = 0.0;
2262     for (ri = rstart; ri < rend; ++ri)
2263     {
2264         r  = ri*invscale;
2265         ea = invscale3;
2266         eb = 2.0*invscale2*r;
2267         ec = invscale*r*r;
2268
2269         pa = invscale3;
2270         pb = 3.0*invscale2*r;
2271         pc = 3.0*invscale*r*r;
2272         pd = r*r*r;
2273
2274         /* this "8" is the stride of the vdwtab array: four cubic-spline coefficients
2275            (y0, f, g, h) for dispersion followed by four for repulsion - perhaps it should be defined? */
2276
2277         offset = 8*ri + offstart;
2278         y0     = vdwtab[offset];
2279         f      = vdwtab[offset+1];
2280         g      = vdwtab[offset+2];
2281         h      = vdwtab[offset+3];
2282
2283         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2284         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2285     }
2286     *enerout = 4.0*M_PI*enersum*tabfactor;
2287     *virout  = 4.0*M_PI*virsum*tabfactor;
2288 }
2289
2290 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2291 {
2292     double   eners[2], virs[2], enersum, virsum;
2293     double   r0, rc3, rc9;
2294     int      ri0, ri1, i;
2295     real     scale, *vdwtab;
2296
2297     fr->enershiftsix    = 0;
2298     fr->enershifttwelve = 0;
2299     fr->enerdiffsix     = 0;
2300     fr->enerdifftwelve  = 0;
2301     fr->virdiffsix      = 0;
2302     fr->virdifftwelve   = 0;
2303
2304     const interaction_const_t *ic = fr->ic;
2305
2306     if (eDispCorr != edispcNO)
2307     {
2308         for (i = 0; i < 2; i++)
2309         {
2310             eners[i] = 0;
2311             virs[i]  = 0;
2312         }
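             /* Index 0 collects the dispersion (1/r^6) terms and index 1 the
              * repulsion (1/r^12) terms, for both the energy and the virial.
              */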
2313         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2314             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2315             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2316             (ic->vdwtype == evdwSHIFT) ||
2317             (ic->vdwtype == evdwSWITCH))
2318         {
2319             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2320                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2321                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2322             {
2323                 gmx_fatal(FARGS,
2324                           "With dispersion correction rvdw-switch can not be zero "
2325                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2326             }
2327
2328             /* TODO This code depends on the logic in tables.c that
2329                constructs the table layout, which should be made
2330                explicit in future cleanup. */
2331             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2332                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2333             scale  = fr->dispersionCorrectionTable->scale;
2334             vdwtab = fr->dispersionCorrectionTable->data;
2335
2336             /* Round the cut-offs to exact table values for precision */
2337             ri0  = static_cast<int>(floor(ic->rvdw_switch*scale));
2338             ri1  = static_cast<int>(ceil(ic->rvdw*scale));
2339
2340             /* The code below has some support for handling force-switching, i.e.
2341              * when the force (instead of potential) is switched over a limited
2342              * region. This leads to a constant shift in the potential inside the
2343              * switching region, which we can handle by adding a constant energy
2344              * term in the force-switch case just like when we do potential-shift.
2345              *
2346              * For now this is not enabled, but to keep the functionality in the
2347              * code we check separately for switch and shift. When we do force-switch
2348              * the shifting point is rvdw_switch, while it is the cutoff when we
2349              * have a classical potential-shift.
2350              *
2351              * For a pure potential-shift the potential has a constant shift
2352              * all the way out to the cutoff, and that is it. For other forms
2353              * we need to calculate the constant shift up to the point where we
2354              * start modifying the potential.
2355              */
2356             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2357
2358             r0   = ri0/scale;
2359             rc3  = r0*r0*r0;
2360             rc9  = rc3*rc3*rc3;
2361
2362             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2363                 (ic->vdwtype == evdwSHIFT))
2364             {
2365                 /* Determine the constant energy shift below rvdw_switch.
2366                  * Table has a scale factor since we have scaled it down to compensate
2367                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2368                  */
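                     /* i.e. shift = V_unmodified(r0) - V_table(r0), with the table storing
                      * the dispersion potential divided by 6 and the repulsion divided by 12.
                      */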
2369                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2370                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2371             }
2372             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2373             {
2374                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
2375                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2376             }
2377
2378             /* Add the constant part from 0 to rvdw_switch.
2379              * This integration from 0 to rvdw_switch overcounts the number
2380              * of interactions by 1, as it also counts the self interaction.
2381              * We will correct for this later.
2382              */
2383             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2384             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2385
2386             /* Calculate the contribution in the range [r0,r1] where we
2387              * modify the potential. For a pure potential-shift modifier we will
2388              * have ri0==ri1, and there will not be any contribution here.
2389              */
2390             for (i = 0; i < 2; i++)
2391             {
2392                 enersum = 0;
2393                 virsum  = 0;
2394                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2395                 eners[i] -= enersum;
2396                 virs[i]  -= virsum;
2397             }
2398
2399             /* Alright: Above we compensated by REMOVING the parts outside r0
2400              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2401              *
2402              * Regardless of whether r0 is the point where we start switching,
2403              * or the cutoff where we calculated the constant shift, we include
2404              * all the parts we are missing out to infinity from r0 by
2405              * calculating the analytical dispersion correction.
2406              */
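                 /* For reference, the analytic tail integrals added below are
                  *   int_r0^inf 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3)
                  *   int_r0^inf 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9)
                  * and, with the virial integrand 4*pi*r^3*(dV/dr),
                  *   8*pi/r0^3 and -16*pi/(3*r0^9), respectively.
                  */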
2407             eners[0] += -4.0*M_PI/(3.0*rc3);
2408             eners[1] +=  4.0*M_PI/(9.0*rc9);
2409             virs[0]  +=  8.0*M_PI/rc3;
2410             virs[1]  += -16.0*M_PI/(3.0*rc9);
2411         }
2412         else if (ic->vdwtype == evdwCUT ||
2413                  EVDW_PME(ic->vdwtype) ||
2414                  ic->vdwtype == evdwUSER)
2415         {
2416             if (ic->vdwtype == evdwUSER && fplog)
2417             {
2418                 fprintf(fplog,
2419                         "WARNING: using dispersion correction with user tables\n");
2420             }
2421
2422             /* Note that with LJ-PME, the dispersion correction is multiplied
2423              * by the difference between the actual C6 and the value of C6
2424              * that would produce the combination rule.
2425              * This means the normal energy and virial difference formulas
2426              * can be used here.
2427              */
2428
2429             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2430             rc9  = rc3*rc3*rc3;
2431             /* Contribution beyond the cut-off */
2432             eners[0] += -4.0*M_PI/(3.0*rc3);
2433             eners[1] +=  4.0*M_PI/(9.0*rc9);
2434             if (ic->vdw_modifier == eintmodPOTSHIFT)
2435             {
2436                 /* Contribution within the cut-off */
2437                 eners[0] += -4.0*M_PI/(3.0*rc3);
2438                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2439             }
2440             /* Contribution beyond the cut-off */
2441             virs[0]  +=  8.0*M_PI/rc3;
2442             virs[1]  += -16.0*M_PI/(3.0*rc9);
2443         }
2444         else
2445         {
2446             gmx_fatal(FARGS,
2447                       "Dispersion correction is not implemented for vdw-type = %s",
2448                       evdw_names[ic->vdwtype]);
2449         }
2450
2451         /* When we deprecate the group kernels the code below can go too */
2452         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2453         {
2454             /* Calculate self-interaction coefficient (assuming that
2455              * the reciprocal-space contribution is constant in the
2456              * region that contributes to the self-interaction).
2457              */
2458             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2459
2460             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2461             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2462         }
2463
2464         fr->enerdiffsix    = eners[0];
2465         fr->enerdifftwelve = eners[1];
2466         /* The 0.5 is due to the Gromacs definition of the virial */
2467         fr->virdiffsix     = 0.5*virs[0];
2468         fr->virdifftwelve  = 0.5*virs[1];
2469     }
2470 }
2471
2472 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2473                    matrix box, real lambda, tensor pres, tensor virial,
2474                    real *prescorr, real *enercorr, real *dvdlcorr)
2475 {
2476     gmx_bool bCorrAll, bCorrPres;
2477     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2478     int      m;
2479
2480     *prescorr = 0;
2481     *enercorr = 0;
2482     *dvdlcorr = 0;
2483
2484     clear_mat(virial);
2485     clear_mat(pres);
2486
2487     if (ir->eDispCorr != edispcNO)
2488     {
2489         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2490                      ir->eDispCorr == edispcAllEnerPres);
2491         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2492                      ir->eDispCorr == edispcAllEnerPres);
2493
2494         invvol = 1/det(box);
2495         if (fr->n_tpi)
2496         {
2497             /* Only correct for the interactions with the inserted molecule */
2498             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2499             ninter = fr->n_tpi;
2500         }
2501         else
2502         {
2503             dens   = fr->numAtomsForDispersionCorrection*invvol;
2504             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2505         }
2506
2507         if (ir->efep == efepNO)
2508         {
2509             avcsix    = fr->avcsix[0];
2510             avctwelve = fr->avctwelve[0];
2511         }
2512         else
2513         {
2514             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2515             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2516         }
2517
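             /* This implements the standard tail correction: with ninter ~ N/2 pairs
              * and dens = N/V, the plain cut-off case (enerdiffsix = -4*pi/(3*rc^3),
              * no shift) reduces to Ecorr = -2*pi*N^2*<C6>/(3*V*rc^3).
              */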
2518         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2519         *enercorr += avcsix*enerdiff;
2520         dvdlambda  = 0.0;
2521         if (ir->efep != efepNO)
2522         {
2523             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2524         }
2525         if (bCorrAll)
2526         {
2527             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2528             *enercorr += avctwelve*enerdiff;
2529             if (fr->efep != efepNO)
2530             {
2531                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2532             }
2533         }
2534
2535         if (bCorrPres)
2536         {
2537             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2538             if (ir->eDispCorr == edispcAllEnerPres)
2539             {
2540                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2541             }
2542             /* The factor 2 is because of the Gromacs virial definition */
2543             spres = -2.0*invvol*svir*PRESFAC;
2544
2545             for (m = 0; m < DIM; m++)
2546             {
2547                 virial[m][m] += svir;
2548                 pres[m][m]   += spres;
2549             }
2550             *prescorr += spres;
2551         }
2552
2553         /* Can't currently control when it prints; for now, just print when debugging */
2554         if (debug)
2555         {
2556             if (bCorrAll)
2557             {
2558                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2559                         avcsix, avctwelve);
2560             }
2561             if (bCorrPres)
2562             {
2563                 fprintf(debug,
2564                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2565                         *enercorr, spres, svir);
2566             }
2567             else
2568             {
2569                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2570             }
2571         }
2572
2573         if (fr->efep != efepNO)
2574         {
2575             *dvdlcorr += dvdlambda;
2576         }
2577     }
2578 }
2579
2580 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2581                             const gmx_mtop_t *mtop, rvec x[],
2582                             gmx_bool bFirst)
2583 {
2584     t_graph        *graph;
2585     int             as, mol;
2586
2587     if (bFirst && fplog)
2588     {
2589         fprintf(fplog, "Removing pbc first time\n");
2590     }
2591
2592     snew(graph, 1);
2593     as = 0;
2594     for (const gmx_molblock_t &molb : mtop->molblock)
2595     {
2596         if (molb.natoms_mol == 1 ||
2597             (!bFirst && mtop->moltype[molb.type].cgs.nr == 1))
2598         {
2599             /* Just one atom or charge group in the molecule, no PBC required */
2600             as += molb.nmol*molb.natoms_mol;
2601         }
2602         else
2603         {
2604             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2605             mk_graph_ilist(nullptr, mtop->moltype[molb.type].ilist,
2606                            0, molb.natoms_mol, FALSE, FALSE, graph);
2607
2608             for (mol = 0; mol < molb.nmol; mol++)
2609             {
2610                 mk_mshift(fplog, graph, ePBC, box, x+as);
2611
2612                 shift_self(graph, box, x+as);
2613                 /* The molecule is whole now.
2614                  * We don't need the second mk_mshift call as in do_pbc_first,
2615                  * since we no longer need this graph.
2616                  */
2617
2618                 as += molb.natoms_mol;
2619             }
2620             done_graph(graph);
2621         }
2622     }
2623     sfree(graph);
2624 }
2625
2626 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2627                        const gmx_mtop_t *mtop, rvec x[])
2628 {
2629     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2630 }
2631
2632 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2633                  const gmx_mtop_t *mtop, rvec x[])
2634 {
2635     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2636 }
2637
2638 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2639 {
2640     int t, nth;
2641     nth = gmx_omp_nthreads_get(emntDefault);
2642
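         /* Split the atom range statically over the threads; e.g. 10 atoms on
          * 4 threads gives chunks starting at 0, 2, 5, 7 with lengths 2, 3, 2, 3.
          */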
2643 #pragma omp parallel for num_threads(nth) schedule(static)
2644     for (t = 0; t < nth; t++)
2645     {
2646         try
2647         {
2648             size_t natoms = x.size();
2649             size_t offset = (natoms*t    )/nth;
2650             size_t len    = (natoms*(t + 1))/nth - offset;
2651             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2652         }
2653         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2654     }
2655 }
2656
2657 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2658 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2659                 const t_inputrec *inputrec,
2660                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2661                 gmx_walltime_accounting_t walltime_accounting,
2662                 nonbonded_verlet_t *nbv,
2663                 gmx_pme_t *pme,
2664                 gmx_bool bWriteStat)
2665 {
2666     t_nrnb *nrnb_tot = nullptr;
2667     double  delta_t  = 0;
2668     double  nbfs     = 0, mflop = 0;
2669     double  elapsed_time,
2670             elapsed_time_over_all_ranks,
2671             elapsed_time_over_all_threads,
2672             elapsed_time_over_all_threads_over_all_ranks;
2673     /* Control whether it is valid to print a report. Only the
2674        simulation master may print, but it should not do so if the run
2675        terminated e.g. before a scheduled reset step. This is
2676        complicated by the fact that PME ranks are unaware of the
2677        reason why they were sent a pmerecvqxFINISH. To avoid
2678        communication deadlocks, we always do the communication for the
2679        report, even if we've decided not to write the report, because
2680        how long it takes to finish the run is not important when we've
2681        decided not to report on the simulation performance.
2682
2683        Further, we only report performance for dynamical integrators,
2684        because those are the only ones for which we plan to
2685        consider doing any optimizations. */
2686     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2687
2688     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2689     {
2690         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2691         printReport = false;
2692     }
2693
2694     if (cr->nnodes > 1)
2695     {
2696         snew(nrnb_tot, 1);
2697 #if GMX_MPI
2698         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2699                       cr->mpi_comm_mysim);
2700 #endif
2701     }
2702     else
2703     {
2704         nrnb_tot = nrnb;
2705     }
2706
2707     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2708     elapsed_time_over_all_ranks                  = elapsed_time;
2709     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2710     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2711 #if GMX_MPI
2712     if (cr->nnodes > 1)
2713     {
2714         /* reduce elapsed_time over all MPI ranks in the current simulation */
2715         MPI_Allreduce(&elapsed_time,
2716                       &elapsed_time_over_all_ranks,
2717                       1, MPI_DOUBLE, MPI_SUM,
2718                       cr->mpi_comm_mysim);
2719         elapsed_time_over_all_ranks /= cr->nnodes;
2720         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2721          * current simulation. */
2722         MPI_Allreduce(&elapsed_time_over_all_threads,
2723                       &elapsed_time_over_all_threads_over_all_ranks,
2724                       1, MPI_DOUBLE, MPI_SUM,
2725                       cr->mpi_comm_mysim);
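             /* Note that the wall time above is averaged over the ranks, while the
              * thread time here remains a sum over all ranks and threads, which is
              * what is passed on to print_perf below.
              */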
2726     }
2727 #endif
2728
2729     if (printReport)
2730     {
2731         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2732     }
2733     if (cr->nnodes > 1)
2734     {
2735         sfree(nrnb_tot);
2736     }
2737
2738     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2739     {
2740         print_dd_statistics(cr, inputrec, fplog);
2741     }
2742
2743     /* TODO Move the responsibility for any scaling by thread counts
2744      * to the code that handled the thread region, so that there's a
2745      * mechanism to keep cycle counting working during the transition
2746      * to task parallelism. */
2747     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2748     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2749     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2750     auto cycle_sum(wallcycle_sum(cr, wcycle));
2751
2752     if (printReport)
2753     {
2754         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2755         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2756         if (pme_gpu_task_enabled(pme))
2757         {
2758             pme_gpu_get_timings(pme, &pme_gpu_timings);
2759         }
2760         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2761                         elapsed_time_over_all_ranks,
2762                         wcycle, cycle_sum,
2763                         nbnxn_gpu_timings,
2764                         &pme_gpu_timings);
2765
2766         if (EI_DYNAMICS(inputrec->eI))
2767         {
2768             delta_t = inputrec->delta_t;
2769         }
2770
2771         if (fplog)
2772         {
2773             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2774                        elapsed_time_over_all_ranks,
2775                        walltime_accounting_get_nsteps_done(walltime_accounting),
2776                        delta_t, nbfs, mflop);
2777         }
2778         if (bWriteStat)
2779         {
2780             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2781                        elapsed_time_over_all_ranks,
2782                        walltime_accounting_get_nsteps_done(walltime_accounting),
2783                        delta_t, nbfs, mflop);
2784         }
2785     }
2786 }
2787
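/* Sets the initial free-energy state and the per-component lambda values from
 * the inputrec; when lam0 is non-null it also receives a copy of the initial
 * lambdas. Does nothing when neither free energy nor simulated tempering is
 * in use. */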
2788 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2789 {
2790     /* this function works, but could probably use a logic rewrite to keep all the different
2791        types of efep straight. */
2792
2793     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2794     {
2795         return;
2796     }
2797
2798     t_lambda *fep = ir->fepvals;
2799     *fep_state    = fep->init_fep_state; /* this might overwrite the state read from
2800                                             a checkpoint; a kludge is currently in place
2801                                             to prevent that. */
2802
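    /* Fill every lambda component, either from the single init-lambda value
     * or from the per-component lambda vectors at the chosen free-energy
     * state. */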
2803     for (int i = 0; i < efptNR; i++)
2804     {
2805         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2806         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2807         {
2808             lambda[i] = fep->init_lambda;
2809             if (lam0)
2810             {
2811                 lam0[i] = lambda[i];
2812             }
2813         }
2814         else
2815         {
2816             lambda[i] = fep->all_lambda[i][*fep_state];
2817             if (lam0)
2818             {
2819                 lam0[i] = lambda[i];
2820             }
2821         }
2822     }
2823     if (ir->bSimTemp)
2824     {
2825         /* need to rescale control temperatures to match current state */
2826         for (int i = 0; i < ir->opts.ngtc; i++)
2827         {
2828             if (ir->opts.ref_t[i] > 0)
2829             {
2830                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2831             }
2832         }
2833     }
2834
2835     /* Report the initial lambda components to the log file */
2836     if (fplog != nullptr)
2837     {
2838         fprintf(fplog, "Initial vector of lambda components:[ ");
2839         for (const auto &l : lambda)
2840         {
2841             fprintf(fplog, "%10.4f ", l);
2842         }
2843         fprintf(fplog, "]\n");
2844     }
2845     return;
2846 }
2847
2848
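/* Performs the general setup shared by the integrators: initial time,
 * simulated annealing and lambda state, the update and
 * center-of-mass-motion removal data, the output files and energy
 * bookkeeping, and clearing of the virial and dipole accumulators. */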
2849 void init_md(FILE *fplog,
2850              const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2851              t_inputrec *ir, const gmx_output_env_t *oenv,
2852              const MdrunOptions &mdrunOptions,
2853              double *t, double *t0,
2854              t_state *globalState, double *lam0,
2855              t_nrnb *nrnb, gmx_mtop_t *mtop,
2856              gmx_update_t **upd,
2857              int nfile, const t_filenm fnm[],
2858              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2859              tensor force_vir, tensor shake_vir, rvec mu_tot,
2860              gmx_bool *bSimAnn, t_vcm **vcm,
2861              gmx_wallcycle_t wcycle)
2862 {
2864
2865     /* Initial values */
2866     *t = *t0 = ir->init_t;
2867
2868     *bSimAnn = FALSE;
2869     for (int i = 0; i < ir->opts.ngtc; i++)
2870     {
2871         /* set bSimAnn if any group is being annealed */
2872         if (ir->opts.annealing[i] != eannNO)
2873         {
2874             *bSimAnn = TRUE;
2875         }
2876     }
2877
2878     /* Initialize lambda variables */
2879     /* TODO: Clean up initialization of fep_state and lambda in t_state.
2880      * We currently need to call initialize_lambdas on non-master ranks
2881      * to initialize lam0.
2882      */
2883     if (MASTER(cr))
2884     {
2885         initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2886     }
2887     else
2888     {
2889         int                      tmpFepState;
2890         std::array<real, efptNR> tmpLambda;
2891         initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2892     }
2893
2894     // TODO upd is never NULL in practice, but the analysers don't know that
2895     if (upd)
2896     {
2897         *upd = init_update(ir);
2898     }
2899     if (*bSimAnn)
2900     {
2901         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2902     }
2903
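    /* Set up the data for removing center-of-mass motion, when the caller
     * asks for it */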
2904     if (vcm != nullptr)
2905     {
2906         *vcm = init_vcm(fplog, &mtop->groups, ir);
2907     }
2908
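    /* Cite the papers for the chosen thermostat and integrator, but only
     * when starting fresh output, not when appending after a restart. */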
2909     if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2910     {
2911         if (ir->etc == etcBERENDSEN)
2912         {
2913             please_cite(fplog, "Berendsen84a");
2914         }
2915         if (ir->etc == etcVRESCALE)
2916         {
2917             please_cite(fplog, "Bussi2007a");
2918         }
2919         if (ir->eI == eiSD1)
2920         {
2921             please_cite(fplog, "Goga2012");
2922         }
2923     }
2924     init_nrnb(nrnb);
2925
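    /* When the caller passed file names, set up the output handlers and the
     * energy bookkeeping; nfile == -1 means there are no files to open. */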
2926     if (nfile != -1)
2927     {
2928         *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2929
2930         *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2931                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2932     }
2933
2934     /* Initialize the virial tensors and the total dipole */
2935     clear_mat(force_vir);
2936     clear_mat(shake_vir);
2937     clear_rvec(mu_tot);
2938 }