clang-tidy modernize
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/calcvir.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/pbcutil/ishift.h"
93 #include "gromacs/pbcutil/mshift.h"
94 #include "gromacs/pbcutil/pbc.h"
95 #include "gromacs/pulling/pull.h"
96 #include "gromacs/pulling/pull_rotation.h"
97 #include "gromacs/timing/cyclecounter.h"
98 #include "gromacs/timing/gpu_timing.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/wallcyclereporting.h"
101 #include "gromacs/timing/walltime_accounting.h"
102 #include "gromacs/topology/topology.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
114
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
118
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // that of a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
124
125
126 void print_time(FILE                     *out,
127                 gmx_walltime_accounting_t walltime_accounting,
128                 int64_t                   step,
129                 t_inputrec               *ir,
130                 const t_commrec          *cr)
131 {
132     time_t finish;
133     char   timebuf[STRLEN];
134     double dt, elapsed_seconds, time_per_step;
135     char   buf[48];
136
137 #if !GMX_THREAD_MPI
138     if (!PAR(cr))
139 #endif
140     {
141         fprintf(out, "\r");
142     }
143     fprintf(out, "step %s", gmx_step_str(step, buf));
144     fflush(out);
145
146     if (step >= ir->nstlist)
147     {
148         double seconds_since_epoch = gmx_gettime();
149         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
151         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
152
153         if (ir->nsteps >= 0)
154         {
155             if (dt >= 300)
156             {
157                 finish = static_cast<time_t>(seconds_since_epoch + dt);
158                 gmx_ctime_r(&finish, timebuf, STRLEN);
159                 sprintf(buf, "%s", timebuf);
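                /* gmx_ctime_r appends a trailing newline; strip it so the message stays on one line */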
160                 buf[strlen(buf)-1] = '\0';
161                 fprintf(out, ", will finish %s", buf);
162             }
163             else
164             {
165                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
166             }
167         }
168         else
169         {
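            /* delta_t is in ps, so delta_t/1000 is ns per step; dividing by the wall-clock
             * seconds per step and multiplying by 86400 s/day gives ns/day.
             */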
170             fprintf(out, " performance: %.1f ns/day    ",
171                     ir->delta_t/1000*24*60*60/time_per_step);
172         }
173     }
174 #if !GMX_THREAD_MPI
175     if (PAR(cr))
176     {
177         fprintf(out, "\n");
178     }
179 #else
180     GMX_UNUSED_VALUE(cr);
181 #endif
182
183     fflush(out);
184 }
185
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187                          double the_time)
188 {
189     char   time_string[STRLEN];
190
191     if (!fplog)
192     {
193         return;
194     }
195
196     {
197         int    i;
198         char   timebuf[STRLEN];
199         time_t temp_time = static_cast<time_t>(the_time);
200
201         gmx_ctime_r(&temp_time, timebuf, STRLEN);
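        /* Copy only the printable part of the string; this drops the trailing newline that gmx_ctime_r appends */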
202         for (i = 0; timebuf[i] >= ' '; i++)
203         {
204             time_string[i] = timebuf[i];
205         }
206         time_string[i] = '\0';
207     }
208
209     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
210 }
211
212 void print_start(FILE *fplog, const t_commrec *cr,
213                  gmx_walltime_accounting_t walltime_accounting,
214                  const char *name)
215 {
216     char buf[STRLEN];
217
218     sprintf(buf, "Started %s", name);
219     print_date_and_time(fplog, cr->nodeid, buf,
220                         walltime_accounting_get_start_time_stamp(walltime_accounting));
221 }
222
223 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
224 {
225     const int      end = forceToAdd.size();
226
227     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
228 #pragma omp parallel for num_threads(nt) schedule(static)
229     for (int i = 0; i < end; i++)
230     {
231         rvec_inc(f[i], forceToAdd[i]);
232     }
233 }
234
235 static void pme_gpu_reduce_outputs(gmx_wallcycle_t                 wcycle,
236                                    gmx::ForceWithVirial           *forceWithVirial,
237                                    gmx::ArrayRef<const gmx::RVec>  pmeForces,
238                                    gmx_enerdata_t                 *enerd,
239                                    const tensor                    vir_Q,
240                                    real                            Vlr_q)
241 {
242     wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
243     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
244     forceWithVirial->addVirialContribution(vir_Q);
245     enerd->term[F_COUL_RECIP] += Vlr_q;
246     sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
247     wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
248 }
249
250 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
251                         tensor vir_part, t_graph *graph, matrix box,
252                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
253 {
254     int    i;
255
256     /* The short-range virial from surrounding boxes */
257     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
258     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
259
260     /* Calculate partial virial, for local atoms only, based on short range.
261      * Total virial is computed in global_stat, called from do_md
262      */
263     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
264     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
265
266     /* Add wall contribution */
267     for (i = 0; i < DIM; i++)
268     {
269         vir_part[i][ZZ] += fr->vir_wall_z[i];
270     }
271
272     if (debug)
273     {
274         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
275     }
276 }
277
278 static void pull_potential_wrapper(const t_commrec *cr,
279                                    const t_inputrec *ir,
280                                    matrix box, gmx::ArrayRef<const gmx::RVec> x,
281                                    gmx::ForceWithVirial *force,
282                                    const t_mdatoms *mdatoms,
283                                    gmx_enerdata_t *enerd,
284                                    real *lambda,
285                                    double t,
286                                    gmx_wallcycle_t wcycle)
287 {
288     t_pbc  pbc;
289     real   dvdl;
290
291     /* Calculate the center-of-mass forces; this requires communication,
292      * which is why pull_potential is called close to other communication.
293      */
294     wallcycle_start(wcycle, ewcPULLPOT);
295     set_pbc(&pbc, ir->ePBC, box);
296     dvdl                     = 0;
297     enerd->term[F_COM_PULL] +=
298         pull_potential(ir->pull_work, mdatoms, &pbc,
299                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
300     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
301     wallcycle_stop(wcycle, ewcPULLPOT);
302 }
303
304 static void pme_receive_force_ener(const t_commrec      *cr,
305                                    gmx::ForceWithVirial *forceWithVirial,
306                                    gmx_enerdata_t       *enerd,
307                                    gmx_wallcycle_t       wcycle)
308 {
309     real   e_q, e_lj, dvdl_q, dvdl_lj;
310     float  cycles_ppdpme, cycles_seppme;
311
312     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
313     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
314
315     /* In case of node-splitting, the PP nodes receive the long-range
316      * forces, virial and energy from the PME nodes here.
317      */
318     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
319     dvdl_q  = 0;
320     dvdl_lj = 0;
321     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
322                       &cycles_seppme);
323     enerd->term[F_COUL_RECIP] += e_q;
324     enerd->term[F_LJ_RECIP]   += e_lj;
325     enerd->dvdl_lin[efptCOUL] += dvdl_q;
326     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
327
328     if (wcycle)
329     {
330         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
331     }
332     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
333 }
334
335 static void print_large_forces(FILE            *fp,
336                                const t_mdatoms *md,
337                                const t_commrec *cr,
338                                int64_t          step,
339                                real             forceTolerance,
340                                const rvec      *x,
341                                const rvec      *f)
342 {
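    /* Compare squared force norms against the squared tolerance to avoid a sqrt per atom */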
343     real           force2Tolerance = gmx::square(forceTolerance);
344     std::uintmax_t numNonFinite    = 0;
345     for (int i = 0; i < md->homenr; i++)
346     {
347         real force2    = norm2(f[i]);
348         bool nonFinite = !std::isfinite(force2);
349         if (force2 >= force2Tolerance || nonFinite)
350         {
351             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
352                     step,
353                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
354         }
355         if (nonFinite)
356         {
357             numNonFinite++;
358         }
359     }
360     if (numNonFinite > 0)
361     {
362         /* Note that with MPI this fatal call on one rank might interrupt
363          * the printing on other ranks. But we can only avoid that with
364          * an expensive MPI barrier that we would need at each step.
365          */
366         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
367     }
368 }
369
370 static void post_process_forces(const t_commrec           *cr,
371                                 int64_t                    step,
372                                 t_nrnb                    *nrnb,
373                                 gmx_wallcycle_t            wcycle,
374                                 const gmx_localtop_t      *top,
375                                 matrix                     box,
376                                 rvec                       x[],
377                                 rvec                       f[],
378                                 gmx::ForceWithVirial      *forceWithVirial,
379                                 tensor                     vir_force,
380                                 const t_mdatoms           *mdatoms,
381                                 t_graph                   *graph,
382                                 t_forcerec                *fr,
383                                 const gmx_vsite_t         *vsite,
384                                 int                        flags)
385 {
386     if (fr->haveDirectVirialContributions)
387     {
388         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
389
390         if (vsite)
391         {
392             /* Spread the mesh force on virtual sites to the other particles...
393              * This is parallelized. MPI communication is performed
394              * if the constructing atoms aren't local.
395              */
396             matrix virial = { { 0 } };
397             spread_vsite_f(vsite, x, fDirectVir, nullptr,
398                            (flags & GMX_FORCE_VIRIAL), virial,
399                            nrnb,
400                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
401             forceWithVirial->addVirialContribution(virial);
402         }
403
404         if (flags & GMX_FORCE_VIRIAL)
405         {
406             /* Now add the forces, this is local */
407             sum_forces(f, forceWithVirial->force_);
408
409             /* Add the direct virial contributions */
410             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
411             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
412
413             if (debug)
414             {
415                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
416             }
417         }
418     }
419
420     if (fr->print_force >= 0)
421     {
422         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
423     }
424 }
425
426 static void do_nb_verlet(t_forcerec *fr,
427                          interaction_const_t *ic,
428                          gmx_enerdata_t *enerd,
429                          int flags, int ilocality,
430                          int clearF,
431                          int64_t step,
432                          t_nrnb *nrnb,
433                          gmx_wallcycle_t wcycle)
434 {
435     if (!(flags & GMX_FORCE_NONBONDED))
436     {
437         /* skip non-bonded calculation */
438         return;
439     }
440
441     nonbonded_verlet_t       *nbv  = fr->nbv;
442     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
443
444     /* GPU kernel launch overhead is already timed separately */
445     if (fr->cutoff_scheme != ecutsVERLET)
446     {
447         gmx_incons("Invalid cut-off scheme passed!");
448     }
449
450     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
451
452     if (!bUsingGpuKernels)
453     {
454         /* When dynamic pair-list pruning is requested, we need to prune
455          * every nstlistPrune steps.
456          */
457         if (nbv->listParams->useDynamicPruning &&
458             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
459         {
460             /* Prune the pair-list beyond fr->ic->rlistPrune using
461              * the current coordinates of the atoms.
462              */
463             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
464             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
465             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
466         }
467
468         wallcycle_sub_start(wcycle, ewcsNONBONDED);
469     }
470
471     switch (nbvg->kernel_type)
472     {
473         case nbnxnk4x4_PlainC:
474         case nbnxnk4xN_SIMD_4xN:
475         case nbnxnk4xN_SIMD_2xNN:
476             nbnxn_kernel_cpu(nbvg,
477                              nbv->nbat,
478                              ic,
479                              fr->shift_vec,
480                              flags,
481                              clearF,
482                              fr->fshift[0],
483                              enerd->grpp.ener[egCOULSR],
484                              fr->bBHAM ?
485                              enerd->grpp.ener[egBHAMSR] :
486                              enerd->grpp.ener[egLJSR]);
487             break;
488
489         case nbnxnk8x8x8_GPU:
490             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
491             break;
492
493         case nbnxnk8x8x8_PlainC:
494             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
495                                  nbv->nbat, ic,
496                                  fr->shift_vec,
497                                  flags,
498                                  clearF,
499                                  nbv->nbat->out[0].f,
500                                  fr->fshift[0],
501                                  enerd->grpp.ener[egCOULSR],
502                                  fr->bBHAM ?
503                                  enerd->grpp.ener[egBHAMSR] :
504                                  enerd->grpp.ener[egLJSR]);
505             break;
506
507         default:
508             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
509
510     }
511     if (!bUsingGpuKernels)
512     {
513         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
514     }
515
516     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
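    /* Select the nrnb cost counters that match the electrostatics treatment of the kernel that was run */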
517     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
518     {
519         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
520     }
521     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
522              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
523     {
524         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
525     }
526     else
527     {
528         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
529     }
530     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
531     if (flags & GMX_FORCE_ENERGY)
532     {
533         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
534         enr_nbnxn_kernel_ljc += 1;
535         enr_nbnxn_kernel_lj  += 1;
536     }
537
538     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
539              nbvg->nbl_lists.natpair_ljq);
540     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
541              nbvg->nbl_lists.natpair_lj);
542     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
543     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
544              nbvg->nbl_lists.natpair_q);
545
546     if (ic->vdw_modifier == eintmodFORCESWITCH)
547     {
548         /* We add up the switch cost separately */
549         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
550                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
551     }
552     if (ic->vdw_modifier == eintmodPOTSWITCH)
553     {
554         /* We add up the switch cost separately */
555         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
556                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
557     }
558     if (ic->vdwtype == evdwPME)
559     {
560         /* We add up the LJ Ewald cost separately */
561         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
562                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
563     }
564 }
565
566 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
567                              t_forcerec           *fr,
568                              rvec                  x[],
569                              rvec                  f[],
570                              const t_mdatoms      *mdatoms,
571                              t_lambda             *fepvals,
572                              real                 *lambda,
573                              gmx_enerdata_t       *enerd,
574                              int                   flags,
575                              t_nrnb               *nrnb,
576                              gmx_wallcycle_t       wcycle)
577 {
578     int              donb_flags;
579     nb_kernel_data_t kernel_data;
580     real             lam_i[efptNR];
581     real             dvdl_nb[efptNR];
582     int              th;
583     int              i, j;
584
585     donb_flags = 0;
586     /* Add short-range interactions */
587     donb_flags |= GMX_NONBONDED_DO_SR;
588
589     /* Currently all group scheme kernels always calculate (shift-)forces */
590     if (flags & GMX_FORCE_FORCES)
591     {
592         donb_flags |= GMX_NONBONDED_DO_FORCE;
593     }
594     if (flags & GMX_FORCE_VIRIAL)
595     {
596         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
597     }
598     if (flags & GMX_FORCE_ENERGY)
599     {
600         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
601     }
602
603     kernel_data.flags  = donb_flags;
604     kernel_data.lambda = lambda;
605     kernel_data.dvdl   = dvdl_nb;
606
607     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
608     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
609
610     /* reset free energy components */
611     for (i = 0; i < efptNR; i++)
612     {
613         dvdl_nb[i]  = 0;
614     }
615
616     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "The number of lists should be the same as the number of NB threads");
617
618     wallcycle_sub_start(wcycle, ewcsNONBONDED);
619 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
620     for (th = 0; th < nbl_lists->nnbl; th++)
621     {
622         try
623         {
624             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
625                                       x, f, fr, mdatoms, &kernel_data, nrnb);
626         }
627         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
628     }
629
630     if (fepvals->sc_alpha != 0)
631     {
632         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
633         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
634     }
635     else
636     {
637         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
638         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
639     }
640
641     /* If we do foreign lambdas and have soft-core interactions,
642      * we have to recalculate the (non-linear) energy contributions.
643      */
644     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
645     {
646         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
647         kernel_data.lambda         = lam_i;
648         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
649         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
650         /* Note that we add to kernel_data.dvdl, but ignore the result */
651
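        /* Loop over all foreign lambda states; index 0 corresponds to the current lambda state */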
652         for (i = 0; i < enerd->n_lambda; i++)
653         {
654             for (j = 0; j < efptNR; j++)
655             {
656                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
657             }
658             reset_foreign_enerdata(enerd);
659 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
660             for (th = 0; th < nbl_lists->nnbl; th++)
661             {
662                 try
663                 {
664                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
665                                               x, f, fr, mdatoms, &kernel_data, nrnb);
666                 }
667                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
668             }
669
670             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
671             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
672         }
673     }
674
675     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
676 }
677
678 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
679 {
680     return nbv != nullptr && nbv->bUseGPU;
681 }
682
683 static inline void clear_rvecs_omp(int n, rvec v[])
684 {
685     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
686
687     /* Note that we would like to avoid this conditional by putting it
688      * into the omp pragma instead, but then we still take the full
689      * omp parallel for overhead (at least with gcc5).
690      */
691     if (nth == 1)
692     {
693         for (int i = 0; i < n; i++)
694         {
695             clear_rvec(v[i]);
696         }
697     }
698     else
699     {
700 #pragma omp parallel for num_threads(nth) schedule(static)
701         for (int i = 0; i < n; i++)
702         {
703             clear_rvec(v[i]);
704         }
705     }
706 }
707
708 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
709  *
710  * \param groupOptions  Group options, containing T-coupling options
711  */
712 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
713 {
714     real nrdfCoupled   = 0;
715     real nrdfUncoupled = 0;
716     real kineticEnergy = 0;
717     for (int g = 0; g < groupOptions.ngtc; g++)
718     {
719         if (groupOptions.tau_t[g] >= 0)
720         {
721             nrdfCoupled   += groupOptions.nrdf[g];
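            /* Equipartition: each coupled degree of freedom contributes kT/2 at its reference temperature */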
722             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
723         }
724         else
725         {
726             nrdfUncoupled += groupOptions.nrdf[g];
727         }
728     }
729
730     /* This conditional with > also catches nrdf=0 */
731     if (nrdfCoupled > nrdfUncoupled)
732     {
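        /* Extrapolate to all degrees of freedom, assuming the uncoupled ones carry the same
         * average kinetic energy as the coupled ones */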
733         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
734     }
735     else
736     {
737         return 0;
738     }
739 }
740
741 /*! \brief This routine checks that the potential energy is finite.
742  *
743  * Always checks that the potential energy is finite. If step equals
744  * inputrec.init_step also checks that the magnitude of the potential energy
745  * is reasonable. Terminates with a fatal error when a check fails.
746  * Note that passing this check does not guarantee finite forces,
747  * since those use slightly different arithmetic. But in most cases
748  * there is just a narrow coordinate range where forces are not finite
749  * and energies are finite.
750  *
751  * \param[in] step      The step number, used for checking and printing
752  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
753  * \param[in] inputrec  The input record
754  */
755 static void checkPotentialEnergyValidity(int64_t               step,
756                                          const gmx_enerdata_t &enerd,
757                                          const t_inputrec     &inputrec)
758 {
759     /* Threshold valid for comparing absolute potential energy against
760      * the kinetic energy. Normally one should not consider absolute
761      * potential energy values, but with a factor of one million
762      * we should never get false positives.
763      */
764     constexpr real c_thresholdFactor = 1e6;
765
766     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
767     real           averageKineticEnergy = 0;
768     /* We only check for large potential energy at the initial step,
769      * because that is by far the most likely step for this to occur
770      * and because computing the average kinetic energy is not free.
771      * Note: nstcalcenergy >> 1 often does not allow catching large energies
772      * before they become NaN.
773      */
774     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
775     {
776         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
777     }
778
779     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
780                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
781     {
782         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
783                   step,
784                   enerd.term[F_EPOT],
785                   energyIsNotFinite ? "not finite" : "extremely high",
786                   enerd.term[F_LJ],
787                   enerd.term[F_COUL_SR],
788                   energyIsNotFinite ? "non-finite" : "very high",
789                   energyIsNotFinite ? " or NaN" : "");
790     }
791 }
792
793 /*! \brief Compute forces and/or energies for special algorithms
794  *
795  * The intention is to collect all calls to algorithms that compute
796  * forces on local atoms only and that do not contribute to the local
797  * virial sum (but add their virial contribution separately).
798  * Eventually these should likely all become ForceProviders.
799  * Within this function the intention is to have algorithms that do
800  * global communication at the end, so global barriers within the MD loop
801  * are as close together as possible.
802  *
803  * \param[in]     fplog            The log file
804  * \param[in]     cr               The communication record
805  * \param[in]     inputrec         The input record
806  * \param[in]     awh              The Awh module (nullptr if none in use).
807  * \param[in]     enforcedRotation Enforced rotation module.
808  * \param[in]     step             The current MD step
809  * \param[in]     t                The current time
810  * \param[in,out] wcycle           Wallcycle accounting struct
811  * \param[in,out] forceProviders   Pointer to a list of force providers
812  * \param[in]     box              The unit cell
813  * \param[in]     x                The coordinates
814  * \param[in]     mdatoms          Per atom properties
815  * \param[in]     lambda           Array of free-energy lambda values
816  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
817  * \param[in,out] forceWithVirial  Force and virial buffers
818  * \param[in,out] enerd            Energy buffer
819  * \param[in,out] ed               Essential dynamics pointer
820  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
821  *
822  * \todo Remove bNS, which is used incorrectly.
823  * \todo Convert all other algorithms called here to ForceProviders.
824  */
825 static void
826 computeSpecialForces(FILE                          *fplog,
827                      const t_commrec               *cr,
828                      const t_inputrec              *inputrec,
829                      gmx::Awh                      *awh,
830                      gmx_enfrot                    *enforcedRotation,
831                      int64_t                        step,
832                      double                         t,
833                      gmx_wallcycle_t                wcycle,
834                      ForceProviders                *forceProviders,
835                      matrix                         box,
836                      gmx::ArrayRef<const gmx::RVec> x,
837                      const t_mdatoms               *mdatoms,
838                      real                          *lambda,
839                      int                            forceFlags,
840                      gmx::ForceWithVirial          *forceWithVirial,
841                      gmx_enerdata_t                *enerd,
842                      const gmx_edsam               *ed,
843                      gmx_bool                       bNS)
844 {
845     const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
846
847     /* NOTE: Currently all ForceProviders only provide forces.
848      *       When they also provide energies, remove this conditional.
849      */
850     if (computeForces)
851     {
852         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
853         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
854
855         /* Collect forces from modules */
856         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
857     }
858
859     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
860     {
861         pull_potential_wrapper(cr, inputrec, box, x,
862                                forceWithVirial,
863                                mdatoms, enerd, lambda, t,
864                                wcycle);
865
866         if (awh)
867         {
868             enerd->term[F_COM_PULL] +=
869                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
870                                                   forceWithVirial,
871                                                   t, step, wcycle, fplog);
872         }
873     }
874
875     rvec *f = as_rvec_array(forceWithVirial->force_.data());
876
877     /* Add the forces from enforced rotation potentials (if any) */
878     if (inputrec->bRot)
879     {
880         wallcycle_start(wcycle, ewcROTadd);
881         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
882         wallcycle_stop(wcycle, ewcROTadd);
883     }
884
885     if (ed)
886     {
887         /* Note that since init_edsam() is called after the initialization
888          * of forcerec, edsam doesn't request the noVirSum force buffer.
889          * Thus if no other algorithm (e.g. PME) requires it, the forces
890          * here will contribute to the virial.
891          */
892         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
893     }
894
895     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
896     if (inputrec->bIMD && computeForces)
897     {
898         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
899     }
900 }
901
902 /*! \brief Launch the prepare_step and spread stages of PME GPU.
903  *
904  * \param[in]  pmedata       The PME structure
905  * \param[in]  box           The box matrix
906  * \param[in]  x             Coordinate array
907  * \param[in]  flags         Force flags
908  * \param[in]  wcycle        The wallcycle structure
909  */
910 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
911                                       matrix          box,
912                                       rvec            x[],
913                                       int             flags,
914                                       gmx_wallcycle_t wcycle)
915 {
916     int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
917     pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
918     pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
919
920     pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
921     pme_gpu_launch_spread(pmedata, x, wcycle);
922 }
923
924 /*! \brief Launch the FFT and gather stages of PME GPU
925  *
926  * This function only implements setting the output forces (no accumulation).
927  *
928  * \param[in]  pmedata        The PME structure
929  * \param[in]  wcycle         The wallcycle structure
930  */
931 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
932                                      gmx_wallcycle_t   wcycle)
933 {
934     pme_gpu_launch_complex_transforms(pmedata, wcycle);
935     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
936 }
937
938 /*! \brief
939  *  Polling wait for either of the PME or nonbonded GPU tasks.
940  *
941  * Instead of waiting for the GPU tasks in a static order, this function
942  * polls to check which of the two tasks completes first, and performs the
943  * associated force-buffer reduction overlapped with the other task.
944  * Unlike a static scheduling order, this always overlaps one of the
945  * reductions, regardless of the GPU task completion order.
946  *
947  * \param[in]     nbv              Nonbonded verlet structure
948  * \param[in]     pmedata          PME module data
949  * \param[in,out] force            Force array to reduce task outputs into.
950  * \param[in,out] forceWithVirial  Force and virial buffers
951  * \param[in,out] fshift           Shift force output vector results are reduced into
952  * \param[in,out] enerd            Energy data structure results are reduced into
953  * \param[in]     flags            Force flags
954  * \param[in]     wcycle           The wallcycle structure
955  */
956 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
957                                         const gmx_pme_t                     *pmedata,
958                                         gmx::PaddedArrayRef<gmx::RVec>      *force,
959                                         gmx::ForceWithVirial                *forceWithVirial,
960                                         rvec                                 fshift[],
961                                         gmx_enerdata_t                      *enerd,
962                                         int                                  flags,
963                                         gmx_wallcycle_t                      wcycle)
964 {
965     bool isPmeGpuDone = false;
966     bool isNbGpuDone  = false;
967
968
969     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
970
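    /* Poll both GPU tasks; once one of them has completed, the other is waited on
     * with a blocking call (GpuTaskCompletion::Wait) so we do not keep spinning.
     */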
971     while (!isPmeGpuDone || !isNbGpuDone)
972     {
973         if (!isPmeGpuDone)
974         {
975             matrix            vir_Q;
976             real              Vlr_q;
977
978             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
979             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
980                                                    vir_Q, &Vlr_q, completionType);
981
982             if (isPmeGpuDone)
983             {
984                 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
985                                        enerd, vir_Q, Vlr_q);
986             }
987         }
988
989         if (!isNbGpuDone)
990         {
991             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
992             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
993             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
994                                                     flags, eatLocal,
995                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
996                                                     fshift, completionType);
997             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
998             // To get the call count right, once the task has finished we
999             // issue a start/stop.
1000             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1001             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1002             if (isNbGpuDone)
1003             {
1004                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1005                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1006
1007                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1008                                                nbv->nbat, as_rvec_array(force->data()), wcycle);
1009             }
1010         }
1011     }
1012 }
1013
1014 /*! \brief
1015  *  Launch the dynamic rolling pruning GPU task.
1016  *
1017  *  We currently alternate local/non-local list pruning in odd-even steps
1018  *  (only pruning every second step without DD).
1019  *
1020  * \param[in]     cr               The communication record
1021  * \param[in]     nbv              Nonbonded verlet structure
1022  * \param[in]     inputrec         The input record
1023  * \param[in]     step             The current MD step
1024  */
1025 static inline void launchGpuRollingPruning(const t_commrec          *cr,
1026                                            const nonbonded_verlet_t *nbv,
1027                                            const t_inputrec         *inputrec,
1028                                            const int64_t             step)
1029 {
1030     /* We should not launch the rolling pruning kernel at a search
1031      * step or just before search steps, since that's useless.
1032      * Without domain decomposition we prune at even steps.
1033      * With domain decomposition we alternate local and non-local
1034      * pruning at even and odd steps.
1035      */
1036     int  numRollingParts     = nbv->listParams->numRollingParts;
1037     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1038     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1039     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1040     if (stepWithCurrentList > 0 &&
1041         stepWithCurrentList < inputrec->nstlist - 1 &&
1042         (stepIsEven || DOMAINDECOMP(cr)))
1043     {
1044         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1045                                           stepIsEven ? eintLocal : eintNonlocal,
1046                                           numRollingParts);
1047     }
1048 }
1049
1050 static void do_force_cutsVERLET(FILE *fplog,
1051                                 const t_commrec *cr,
1052                                 const gmx_multisim_t *ms,
1053                                 const t_inputrec *inputrec,
1054                                 gmx::Awh *awh,
1055                                 gmx_enfrot *enforcedRotation,
1056                                 int64_t step,
1057                                 t_nrnb *nrnb,
1058                                 gmx_wallcycle_t wcycle,
1059                                 const gmx_localtop_t *top,
1060                                 const gmx_groups_t * /* groups */,
1061                                 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1062                                 history_t *hist,
1063                                 gmx::PaddedArrayRef<gmx::RVec> force,
1064                                 tensor vir_force,
1065                                 const t_mdatoms *mdatoms,
1066                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1067                                 real *lambda,
1068                                 t_graph *graph,
1069                                 t_forcerec *fr,
1070                                 interaction_const_t *ic,
1071                                 const gmx_vsite_t *vsite,
1072                                 rvec mu_tot,
1073                                 double t,
1074                                 const gmx_edsam *ed,
1075                                 int flags,
1076                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1077                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1078 {
1079     int                 cg1, i, j;
1080     double              mu[2*DIM];
1081     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1082     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1083     rvec                vzero, box_diag;
1084     float               cycles_pme, cycles_wait_gpu;
1085     nonbonded_verlet_t *nbv = fr->nbv;
1086
1087     bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1088     bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1089     bFillGrid     = (bNS && bStateChanged);
1090     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1091     bDoForces     = (flags & GMX_FORCE_FORCES);
1092     bUseGPU       = fr->nbv->bUseGPU;
1093     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1094
1095     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1096     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1097     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1098         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1099
1100     /* At a search step we need to start the first balancing region
1101      * somewhere early inside the step after communication during domain
1102      * decomposition (and not during the previous step as usual).
1103      */
1104     if (bNS &&
1105         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1106     {
1107         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1108     }
1109
1110     cycles_wait_gpu = 0;
1111
1112     const int start  = 0;
1113     const int homenr = mdatoms->homenr;
1114
1115     clear_mat(vir_force);
1116
1117     if (DOMAINDECOMP(cr))
1118     {
1119         cg1 = cr->dd->globalAtomGroupIndices.size();
1120     }
1121     else
1122     {
1123         cg1 = top->cgs.nr;
1124     }
1125     if (fr->n_tpi > 0)
1126     {
1127         cg1--;
1128     }
1129
1130     if (bStateChanged)
1131     {
1132         update_forcerec(fr, box);
1133
1134         if (inputrecNeedMutot(inputrec))
1135         {
1136             /* Calculate the total (local) dipole moments in a temporary common array.
1137              * This makes it possible to sum them over nodes faster.
1138              */
1139             calc_mu(start, homenr,
1140                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1141                     mu, mu+DIM);
1142         }
1143     }
1144
1145     if (fr->ePBC != epbcNONE)
1146     {
1147         /* Compute shift vectors every step,
1148          * because of pressure coupling or box deformation!
1149          */
1150         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1151         {
1152             calc_shifts(box, fr->shift_vec);
1153         }
1154
1155         if (bCalcCGCM)
1156         {
1157             put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1158             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1159         }
1160         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1161         {
1162             unshift_self(graph, box, as_rvec_array(x.data()));
1163         }
1164     }
1165
1166     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1167                                  fr->shift_vec, nbv->nbat);
1168
1169 #if GMX_MPI
1170     if (!thisRankHasDuty(cr, DUTY_PME))
1171     {
1172         /* Send particle coordinates to the pme nodes.
1173          * Since this is only implemented for domain decomposition
1174          * and domain decomposition does not use the graph,
1175          * we do not need to worry about shifting.
1176          */
1177         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1178                                  lambda[efptCOUL], lambda[efptVDW],
1179                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1180                                  step, wcycle);
1181     }
1182 #endif /* GMX_MPI */
1183
1184     if (useGpuPme)
1185     {
1186         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1187     }
1188
1189     /* do gridding for pair search */
1190     if (bNS)
1191     {
1192         if (graph && bStateChanged)
1193         {
1194             /* Calculate intramolecular shift vectors to make molecules whole */
1195             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1196         }
1197
1198         clear_rvec(vzero);
1199         box_diag[XX] = box[XX][XX];
1200         box_diag[YY] = box[YY][YY];
1201         box_diag[ZZ] = box[ZZ][ZZ];
1202
1203         wallcycle_start(wcycle, ewcNS);
1204         if (!DOMAINDECOMP(cr))
1205         {
1206             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1207             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1208                               0, vzero, box_diag,
1209                               0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1210                               0, nullptr,
1211                               nbv->grp[eintLocal].kernel_type,
1212                               nbv->nbat);
1213             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1214         }
1215         else
1216         {
1217             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1218             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1219                                        fr->cginfo, as_rvec_array(x.data()),
1220                                        nbv->grp[eintNonlocal].kernel_type,
1221                                        nbv->nbat);
1222             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1223         }
1224
1225         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1226         wallcycle_stop(wcycle, ewcNS);
1227     }
1228
1229     /* initialize the GPU atom data and copy shift vector */
1230     if (bUseGPU)
1231     {
1232         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1233         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1234
1235         if (bNS)
1236         {
1237             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1238         }
1239
1240         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1241
1242         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1243         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1244     }
1245
1246     /* do local pair search */
1247     if (bNS)
1248     {
1249         wallcycle_start_nocount(wcycle, ewcNS);
1250         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1251         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1252                             &top->excls,
1253                             nbv->listParams->rlistOuter,
1254                             nbv->min_ci_balanced,
1255                             &nbv->grp[eintLocal].nbl_lists,
1256                             eintLocal,
1257                             nbv->grp[eintLocal].kernel_type,
1258                             nrnb);
1259         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1260         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1261         {
1262             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1263         }
1264         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1265
1266         if (bUseGPU)
1267         {
1268             /* initialize local pair-list on the GPU */
1269             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1270                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
1271                                     eintLocal);
1272         }
1273         wallcycle_stop(wcycle, ewcNS);
1274     }
1275     else
1276     {
1277         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1278                                         nbv->nbat, wcycle);
1279     }
1280
1281     if (bUseGPU)
1282     {
1283         if (DOMAINDECOMP(cr))
1284         {
1285             ddOpenBalanceRegionGpu(cr->dd);
1286         }
1287
1288         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1289         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1290         /* launch local nonbonded work on GPU */
1291         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1292                      step, nrnb, wcycle);
1293         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1294         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1295     }
1296
1297     if (useGpuPme)
1298     {
1299         // In PME GPU and mixed mode we launch FFT / gather after the
1300         // X copy/transform to allow overlap as well as after the GPU NB
1301         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1302         // the nonbonded kernel.
1303         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1304     }
1305
1306     /* Communicate coordinates and sum dipole if necessary +
1307        do non-local pair search */
1308     if (DOMAINDECOMP(cr))
1309     {
1310         if (bNS)
1311         {
1312             wallcycle_start_nocount(wcycle, ewcNS);
1313             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1314
1315             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1316                                 &top->excls,
1317                                 nbv->listParams->rlistOuter,
1318                                 nbv->min_ci_balanced,
1319                                 &nbv->grp[eintNonlocal].nbl_lists,
1320                                 eintNonlocal,
1321                                 nbv->grp[eintNonlocal].kernel_type,
1322                                 nrnb);
1323             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1324             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1325             {
1326                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1327             }
1328             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1329
1330             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1331             {
1332                 /* initialize non-local pair-list on the GPU */
1333                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1334                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1335                                         eintNonlocal);
1336             }
1337             wallcycle_stop(wcycle, ewcNS);
1338         }
1339         else
1340         {
1341             dd_move_x(cr->dd, box, x, wcycle);
1342
1343             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1344                                             nbv->nbat, wcycle);
1345         }
1346
1347         if (bUseGPU)
1348         {
1349             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1350             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1351             /* launch non-local nonbonded tasks on GPU */
1352             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1353                          step, nrnb, wcycle);
1354             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1355             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1356         }
1357     }
1358
1359     if (bUseGPU)
1360     {
1361         /* launch D2H copy-back F */
1362         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1363         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1364         if (DOMAINDECOMP(cr))
1365         {
1366             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1367                                      flags, eatNonlocal);
1368         }
1369         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1370                                  flags, eatLocal);
1371         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1372         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1373     }
1374
1375     if (bStateChanged && inputrecNeedMutot(inputrec))
1376     {
1377         if (PAR(cr))
1378         {
1379             gmx_sumd(2*DIM, mu, cr);
1380             ddReopenBalanceRegionCpu(cr->dd);
1381         }
1382
1383         for (i = 0; i < 2; i++)
1384         {
1385             for (j = 0; j < DIM; j++)
1386             {
1387                 fr->mu_tot[i][j] = mu[i*DIM + j];
1388             }
1389         }
1390     }
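         /* Below, the total dipole is either the A-state dipole or, with
          * free-energy perturbation, a linear interpolation between the
          * A- and B-state dipoles using the coulomb lambda.
          */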
1391     if (fr->efep == efepNO)
1392     {
1393         copy_rvec(fr->mu_tot[0], mu_tot);
1394     }
1395     else
1396     {
1397         for (j = 0; j < DIM; j++)
1398         {
1399             mu_tot[j] =
1400                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1401                 lambda[efptCOUL]*fr->mu_tot[1][j];
1402         }
1403     }
1404
1405     /* Reset energies */
1406     reset_enerdata(enerd);
1407     clear_rvecs(SHIFTS, fr->fshift);
1408
1409     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1410     {
1411         wallcycle_start(wcycle, ewcPPDURINGPME);
1412         dd_force_flop_start(cr->dd, nrnb);
1413     }
1414
1415     if (inputrec->bRot)
1416     {
1417         wallcycle_start(wcycle, ewcROT);
1418         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1419         wallcycle_stop(wcycle, ewcROT);
1420     }
1421
1422     /* Temporary solution until all routines take PaddedRVecVector */
1423     rvec *const f = as_rvec_array(force.data());
1424
1425     /* Start the force cycle counter.
1426      * Note that a different counter is used for dynamic load balancing.
1427      */
1428     wallcycle_start(wcycle, ewcFORCE);
1429
1430     gmx::ArrayRef<gmx::RVec> forceRef = force;
1431     if (bDoForces)
1432     {
1433         /* If we need to compute the virial, we might need a separate
1434          * force buffer for algorithms for which the virial is calculated
1435          * directly, such as PME.
1436          */
1437         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1438         {
1439             forceRef = *fr->forceBufferForDirectVirialContributions;
1440
1441             /* TODO: update comment
1442              * We only compute forces on local atoms. Note that vsites can
1443              * spread to non-local atoms, but that part of the buffer is
1444              * cleared separately in the vsite spreading code.
1445              */
1446             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1447         }
1448         /* Clear the short- and long-range forces */
1449         clear_rvecs_omp(fr->natoms_force_constr, f);
1450     }
1451
1452     /* forceWithVirial uses the local atom range only */
1453     gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1454
1455     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1456     {
1457         clear_pull_forces(inputrec->pull_work);
1458     }
1459
1460     /* We calculate the non-bonded forces, when done on the CPU, here.
1461      * We do this before calling do_force_lowlevel, because in that
1462      * function, the listed forces are calculated before PME, which
1463      * does communication.  With this order, non-bonded and listed
1464      * force calculation imbalance can be balanced out by the domain
1465      * decomposition load balancing.
1466      */
1467
1468     if (!bUseOrEmulGPU)
1469     {
1470         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1471                      step, nrnb, wcycle);
1472     }
1473
1474     if (fr->efep != efepNO)
1475     {
1476         /* Calculate the local and non-local free energy interactions here.
1477          * Happens here on the CPU both with and without GPU.
1478          */
1479         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1480         {
1481             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1482                              fr, as_rvec_array(x.data()), f, mdatoms,
1483                              inputrec->fepvals, lambda,
1484                              enerd, flags, nrnb, wcycle);
1485         }
1486
1487         if (DOMAINDECOMP(cr) &&
1488             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1489         {
1490             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1491                              fr, as_rvec_array(x.data()), f, mdatoms,
1492                              inputrec->fepvals, lambda,
1493                              enerd, flags, nrnb, wcycle);
1494         }
1495     }
1496
1497     if (!bUseOrEmulGPU)
1498     {
1499         int aloc;
1500
1501         if (DOMAINDECOMP(cr))
1502         {
1503             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1504                          step, nrnb, wcycle);
1505         }
1506
1507         /* Without a GPU or GPU emulation all non-bonded work was done in
1508          * the local interaction group, so use that group below when
1509          * deciding whether the fshift output buffers need to be reduced.
1510          */
1511         aloc = eintLocal;
1515
1516         /* Add all the non-bonded forces to the normal force array.
1517          * This can be split into a local and a non-local part when overlapping
1518          * communication with computation under domain decomposition.
1519          */
1520         wallcycle_stop(wcycle, ewcFORCE);
1521
1522         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1523
1524         wallcycle_start_nocount(wcycle, ewcFORCE);
1525
1526         /* if there are multiple fshift output buffers reduce them */
1527         if ((flags & GMX_FORCE_VIRIAL) &&
1528             nbv->grp[aloc].nbl_lists.nnbl > 1)
1529         {
1530             /* This is not in a subcounter because it takes a
1531                negligible and constant-sized amount of time */
1532             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1533                                                      fr->fshift);
1534         }
1535     }
1536
1537     /* update QMMMrec, if necessary */
1538     if (fr->bQMMM)
1539     {
1540         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1541     }
1542
1543     /* Compute the bonded and non-bonded energies and optionally forces */
1544     do_force_lowlevel(fr, inputrec, &(top->idef),
1545                       cr, ms, nrnb, wcycle, mdatoms,
1546                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1547                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1548                       flags, &cycles_pme);
1549
1550     wallcycle_stop(wcycle, ewcFORCE);
1551
1552     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1553                          step, t, wcycle,
1554                          fr->forceProviders, box, x, mdatoms, lambda,
1555                          flags, &forceWithVirial, enerd,
1556                          ed, bNS);
1557
1558     if (bUseOrEmulGPU)
1559     {
1560         /* wait for non-local forces (or calculate in emulation mode) */
1561         if (DOMAINDECOMP(cr))
1562         {
1563             if (bUseGPU)
1564             {
1565                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1566                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1567                                            flags, eatNonlocal,
1568                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1569                                            fr->fshift);
1570                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1571             }
1572             else
1573             {
1574                 wallcycle_start_nocount(wcycle, ewcFORCE);
1575                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1576                              step, nrnb, wcycle);
1577                 wallcycle_stop(wcycle, ewcFORCE);
1578             }
1579
1580             /* skip the reduction if there was no non-local work to do */
1581             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1582             {
1583                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1584                                                nbv->nbat, f, wcycle);
1585             }
1586         }
1587     }
1588
1589     if (DOMAINDECOMP(cr))
1590     {
1591         /* We are done with the CPU compute.
1592          * We will now communicate the non-local forces.
1593          * If we use a GPU this will overlap with GPU work, so in that case
1594          * we do not close the DD force balancing region here.
1595          */
1596         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1597         {
1598             ddCloseBalanceRegionCpu(cr->dd);
1599         }
1600         if (bDoForces)
1601         {
1602             dd_move_f(cr->dd, force, fr->fshift, wcycle);
1603         }
1604     }
1605
1606     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1607     // an alternating wait/reduction scheme.
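         // The reduction helper below is expected to poll both GPU tasks and
         // reduce the output of whichever completes first, rather than waiting
         // for them in a fixed order.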
1608     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1609     if (alternateGpuWait)
1610     {
1611         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1612     }
1613
1614     if (!alternateGpuWait && useGpuPme)
1615     {
1616         gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1617         matrix vir_Q;
1618         real   Vlr_q = 0.0;
1619         pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1620         pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1621     }
1622
1623     /* Wait for local GPU NB outputs on the non-alternating wait path */
1624     if (!alternateGpuWait && bUseGPU)
1625     {
1626         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1627          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1628          * but even with a step of 0.1 ms the difference is less than 1%
1629          * of the step time.
1630          */
1631         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1632
1633         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1634         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1635                                    flags, eatLocal,
1636                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1637                                    fr->fshift);
1638         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1639
1640         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1641         {
1642             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1643             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1644             {
1645                 /* We measured only a few cycles, so it could be that the
1646                  * kernel and transfer finished earlier and there was no
1647                  * actual wait time, only API call overhead.
1648                  * Then the actual wait time could be anywhere between 0 and
1649                  * the measured value; the balancing code accounts for half of it.
1650                  */
1651                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1652             }
1653             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1654         }
1655     }
1656
1657     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1658     {
1659         // NOTE: emulation kernel is not included in the balancing region,
1660         // but emulation mode does not target performance anyway
1661         wallcycle_start_nocount(wcycle, ewcFORCE);
1662         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1663                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1664                      step, nrnb, wcycle);
1665         wallcycle_stop(wcycle, ewcFORCE);
1666     }
1667
1668     if (useGpuPme)
1669     {
1670         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1671     }
1672
1673     if (bUseGPU)
1674     {
1675         /* now clear the GPU outputs while we finish the step on the CPU */
1676         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1677         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1678         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1679
1680         /* Is dynamic pair-list pruning activated? */
1681         if (nbv->listParams->useDynamicPruning)
1682         {
1683             launchGpuRollingPruning(cr, nbv, inputrec, step);
1684         }
1685         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1686         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1687     }
1688
1689     /* Do the nonbonded GPU (or emulation) force buffer reduction
1690      * on the non-alternating path. */
1691     if (bUseOrEmulGPU && !alternateGpuWait)
1692     {
1693         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1694                                        nbv->nbat, f, wcycle);
1695     }
1696
1697     if (DOMAINDECOMP(cr))
1698     {
1699         dd_force_flop_stop(cr->dd, nrnb);
1700     }
1701
1702     if (bDoForces)
1703     {
1704         /* If we have NoVirSum forces, but we do not calculate the virial,
1705          * we sum fr->f_novirsum=f later.
1706          */
1707         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1708         {
1709             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1710                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1711         }
1712
1713         if (flags & GMX_FORCE_VIRIAL)
1714         {
1715             /* Calculation of the virial must be done after vsites! */
1716             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1717                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1718         }
1719     }
1720
1721     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1722     {
1723         /* In case of node-splitting, the PP nodes receive the long-range
1724          * forces, virial and energy from the PME nodes here.
1725          */
1726         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1727     }
1728
1729     if (bDoForces)
1730     {
1731         post_process_forces(cr, step, nrnb, wcycle,
1732                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1733                             vir_force, mdatoms, graph, fr, vsite,
1734                             flags);
1735     }
1736
1737     if (flags & GMX_FORCE_ENERGY)
1738     {
1739         /* Sum the potential energy terms from group contributions */
1740         sum_epot(&(enerd->grpp), enerd->term);
1741
1742         if (!EI_TPI(inputrec->eI))
1743         {
1744             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1745         }
1746     }
1747 }
1748
1749 static void do_force_cutsGROUP(FILE *fplog,
1750                                const t_commrec *cr,
1751                                const gmx_multisim_t *ms,
1752                                const t_inputrec *inputrec,
1753                                gmx::Awh *awh,
1754                                gmx_enfrot *enforcedRotation,
1755                                int64_t step,
1756                                t_nrnb *nrnb,
1757                                gmx_wallcycle_t wcycle,
1758                                gmx_localtop_t *top,
1759                                const gmx_groups_t *groups,
1760                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1761                                history_t *hist,
1762                                gmx::PaddedArrayRef<gmx::RVec> force,
1763                                tensor vir_force,
1764                                const t_mdatoms *mdatoms,
1765                                gmx_enerdata_t *enerd,
1766                                t_fcdata *fcd,
1767                                real *lambda,
1768                                t_graph *graph,
1769                                t_forcerec *fr,
1770                                const gmx_vsite_t *vsite,
1771                                rvec mu_tot,
1772                                double t,
1773                                const gmx_edsam *ed,
1774                                int flags,
1775                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1776                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1777 {
1778     int        cg0, cg1, i, j;
1779     double     mu[2*DIM];
1780     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1781     gmx_bool   bDoForces;
1782     float      cycles_pme;
1783
1784     const int  start  = 0;
1785     const int  homenr = mdatoms->homenr;
1786
1787     clear_mat(vir_force);
1788
1789     cg0 = 0;
1790     if (DOMAINDECOMP(cr))
1791     {
1792         cg1 = cr->dd->globalAtomGroupIndices.size();
1793     }
1794     else
1795     {
1796         cg1 = top->cgs.nr;
1797     }
1798     if (fr->n_tpi > 0)
1799     {
1800         cg1--;
1801     }
1802
1803     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1804     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1805     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1806     bFillGrid      = (bNS && bStateChanged);
1807     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1808     bDoForces      = (flags & GMX_FORCE_FORCES);
1809
1810     if (bStateChanged)
1811     {
1812         update_forcerec(fr, box);
1813
1814         if (inputrecNeedMutot(inputrec))
1815         {
1816             /* Calculate total (local) dipole moment in a temporary common array.
1817              * This makes it possible to sum them over nodes faster.
1818              */
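                 /* Here mu[0..DIM-1] receives the state A dipole and
                  * mu[DIM..2*DIM-1] the state B dipole, matching the
                  * fr->mu_tot[i][j] = mu[i*DIM + j] unpacking below.
                  */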
1819             calc_mu(start, homenr,
1820                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1821                     mu, mu+DIM);
1822         }
1823     }
1824
1825     if (fr->ePBC != epbcNONE)
1826     {
1827         /* Compute shift vectors every step,
1828          * because of pressure coupling or box deformation!
1829          */
1830         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1831         {
1832             calc_shifts(box, fr->shift_vec);
1833         }
1834
1835         if (bCalcCGCM)
1836         {
1837             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1838                                      &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1839             inc_nrnb(nrnb, eNR_CGCM, homenr);
1840             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1841         }
1842         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1843         {
1844             unshift_self(graph, box, as_rvec_array(x.data()));
1845         }
1846     }
1847     else if (bCalcCGCM)
1848     {
1849         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1850         inc_nrnb(nrnb, eNR_CGCM, homenr);
1851     }
1852
1853     if (bCalcCGCM && gmx_debug_at)
1854     {
1855         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1856     }
1857
1858 #if GMX_MPI
1859     if (!thisRankHasDuty(cr, DUTY_PME))
1860     {
1861         /* Send particle coordinates to the pme nodes.
1862          * Since this is only implemented for domain decomposition
1863          * and domain decomposition does not use the graph,
1864          * we do not need to worry about shifting.
1865          */
1866         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1867                                  lambda[efptCOUL], lambda[efptVDW],
1868                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1869                                  step, wcycle);
1870     }
1871 #endif /* GMX_MPI */
1872
1873     /* Communicate coordinates and sum dipole if necessary */
1874     if (DOMAINDECOMP(cr))
1875     {
1876         dd_move_x(cr->dd, box, x, wcycle);
1877
1878         /* No GPU support, no move_x overlap, so reopen the balance region here */
1879         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1880         {
1881             ddReopenBalanceRegionCpu(cr->dd);
1882         }
1883     }
1884
1885     if (inputrecNeedMutot(inputrec))
1886     {
1887         if (bStateChanged)
1888         {
1889             if (PAR(cr))
1890             {
1891                 gmx_sumd(2*DIM, mu, cr);
1892                 ddReopenBalanceRegionCpu(cr->dd);
1893             }
1894             for (i = 0; i < 2; i++)
1895             {
1896                 for (j = 0; j < DIM; j++)
1897                 {
1898                     fr->mu_tot[i][j] = mu[i*DIM + j];
1899                 }
1900             }
1901         }
1902         if (fr->efep == efepNO)
1903         {
1904             copy_rvec(fr->mu_tot[0], mu_tot);
1905         }
1906         else
1907         {
1908             for (j = 0; j < DIM; j++)
1909             {
1910                 mu_tot[j] =
1911                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1912             }
1913         }
1914     }
1915
1916     /* Reset energies */
1917     reset_enerdata(enerd);
1918     clear_rvecs(SHIFTS, fr->fshift);
1919
1920     if (bNS)
1921     {
1922         wallcycle_start(wcycle, ewcNS);
1923
1924         if (graph && bStateChanged)
1925         {
1926             /* Calculate intramolecular shift vectors to make molecules whole */
1927             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1928         }
1929
1930         /* Do the actual neighbour searching */
1931         ns(fplog, fr, box,
1932            groups, top, mdatoms,
1933            cr, nrnb, bFillGrid);
1934
1935         wallcycle_stop(wcycle, ewcNS);
1936     }
1937
1938     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1939     {
1940         wallcycle_start(wcycle, ewcPPDURINGPME);
1941         dd_force_flop_start(cr->dd, nrnb);
1942     }
1943
1944     if (inputrec->bRot)
1945     {
1946         wallcycle_start(wcycle, ewcROT);
1947         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1948         wallcycle_stop(wcycle, ewcROT);
1949     }
1950
1951     /* Temporary solution until all routines take PaddedRVecVector */
1952     rvec *f = as_rvec_array(force.data());
1953
1954     /* Start the force cycle counter.
1955      * Note that a different counter is used for dynamic load balancing.
1956      */
1957     wallcycle_start(wcycle, ewcFORCE);
1958
1959     gmx::ArrayRef<gmx::RVec> forceRef = force;
1960     if (bDoForces)
1961     {
1962         /* If we need to compute the virial, we might need a separate
1963          * force buffer for algorithms for which the virial is calculated
1964          * separately, such as PME.
1965          */
1966         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1967         {
1968             forceRef = *fr->forceBufferForDirectVirialContributions;
1969
1970             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1971         }
1972
1973         /* Clear the short- and long-range forces */
1974         clear_rvecs(fr->natoms_force_constr, f);
1975     }
1976
1977     /* forceWithVirial might need the full force atom range */
1978     gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1979
1980     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1981     {
1982         clear_pull_forces(inputrec->pull_work);
1983     }
1984
1985     /* update QMMMrec, if necessary */
1986     if (fr->bQMMM)
1987     {
1988         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1989     }
1990
1991     /* Compute the bonded and non-bonded energies and optionally forces */
1992     do_force_lowlevel(fr, inputrec, &(top->idef),
1993                       cr, ms, nrnb, wcycle, mdatoms,
1994                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1995                       box, inputrec->fepvals, lambda,
1996                       graph, &(top->excls), fr->mu_tot,
1997                       flags,
1998                       &cycles_pme);
1999
2000     wallcycle_stop(wcycle, ewcFORCE);
2001
2002     if (DOMAINDECOMP(cr))
2003     {
2004         dd_force_flop_stop(cr->dd, nrnb);
2005
2006         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2007         {
2008             ddCloseBalanceRegionCpu(cr->dd);
2009         }
2010     }
2011
2012     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2013                          step, t, wcycle,
2014                          fr->forceProviders, box, x, mdatoms, lambda,
2015                          flags, &forceWithVirial, enerd,
2016                          ed, bNS);
2017
2018     if (bDoForces)
2019     {
2020         /* Communicate the forces */
2021         if (DOMAINDECOMP(cr))
2022         {
2023             dd_move_f(cr->dd, force, fr->fshift, wcycle);
2024             /* Do we need to communicate the separate force array
2025              * for terms that do not contribute to the single sum virial?
2026              * Position restraints and electric fields do not introduce
2027              * inter-cg forces, only full electrostatics methods do.
2028              * When we do not calculate the virial, fr->f_novirsum = f,
2029              * so we have already communicated these forces.
2030              */
2031             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2032                 (flags & GMX_FORCE_VIRIAL))
2033             {
2034                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2035             }
2036         }
2037
2038         /* If we have NoVirSum forces, but we do not calculate the virial,
2039          * we sum fr->f_novirsum=f later.
2040          */
2041         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2042         {
2043             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2044                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2045         }
2046
2047         if (flags & GMX_FORCE_VIRIAL)
2048         {
2049             /* Calculation of the virial must be done after vsites! */
2050             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2051                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2052         }
2053     }
2054
2055     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2056     {
2057         /* In case of node-splitting, the PP nodes receive the long-range
2058          * forces, virial and energy from the PME nodes here.
2059          */
2060         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2061     }
2062
2063     if (bDoForces)
2064     {
2065         post_process_forces(cr, step, nrnb, wcycle,
2066                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2067                             vir_force, mdatoms, graph, fr, vsite,
2068                             flags);
2069     }
2070
2071     if (flags & GMX_FORCE_ENERGY)
2072     {
2073         /* Sum the potential energy terms from group contributions */
2074         sum_epot(&(enerd->grpp), enerd->term);
2075
2076         if (!EI_TPI(inputrec->eI))
2077         {
2078             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2079         }
2080     }
2081
2082 }
2083
2084 void do_force(FILE                                     *fplog,
2085               const t_commrec                          *cr,
2086               const gmx_multisim_t                     *ms,
2087               const t_inputrec                         *inputrec,
2088               gmx::Awh                                 *awh,
2089               gmx_enfrot                               *enforcedRotation,
2090               int64_t                                   step,
2091               t_nrnb                                   *nrnb,
2092               gmx_wallcycle_t                           wcycle,
2093               gmx_localtop_t                           *top,
2094               const gmx_groups_t                       *groups,
2095               matrix                                    box,
2096               gmx::PaddedArrayRef<gmx::RVec>            x,
2097               history_t                                *hist,
2098               gmx::PaddedArrayRef<gmx::RVec>            force,
2099               tensor                                    vir_force,
2100               const t_mdatoms                          *mdatoms,
2101               gmx_enerdata_t                           *enerd,
2102               t_fcdata                                 *fcd,
2103               gmx::ArrayRef<real>                       lambda,
2104               t_graph                                  *graph,
2105               t_forcerec                               *fr,
2106               const gmx_vsite_t                        *vsite,
2107               rvec                                      mu_tot,
2108               double                                    t,
2109               const gmx_edsam                          *ed,
2110               int                                       flags,
2111               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2112               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2113 {
2114     /* modify force flag if not doing nonbonded */
2115     if (!fr->bNonbonded)
2116     {
2117         flags &= ~GMX_FORCE_NONBONDED;
2118     }
2119
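         /* Padded buffers are required so that (possibly unaligned) SIMD
          * accesses just past the last atom stay within the allocation.
          */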
2120     GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2121     GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2122
2123     switch (inputrec->cutoff_scheme)
2124     {
2125         case ecutsVERLET:
2126             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2127                                 awh, enforcedRotation, step, nrnb, wcycle,
2128                                 top,
2129                                 groups,
2130                                 box, x, hist,
2131                                 force, vir_force,
2132                                 mdatoms,
2133                                 enerd, fcd,
2134                                 lambda.data(), graph,
2135                                 fr, fr->ic,
2136                                 vsite, mu_tot,
2137                                 t, ed,
2138                                 flags,
2139                                 ddOpenBalanceRegion,
2140                                 ddCloseBalanceRegion);
2141             break;
2142         case ecutsGROUP:
2143             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2144                                awh, enforcedRotation, step, nrnb, wcycle,
2145                                top,
2146                                groups,
2147                                box, x, hist,
2148                                force, vir_force,
2149                                mdatoms,
2150                                enerd, fcd,
2151                                lambda.data(), graph,
2152                                fr, vsite, mu_tot,
2153                                t, ed,
2154                                flags,
2155                                ddOpenBalanceRegion,
2156                                ddCloseBalanceRegion);
2157             break;
2158         default:
2159             gmx_incons("Invalid cut-off scheme passed!");
2160     }
2161
2162     /* In case we don't have constraints and are using GPUs, the next balancing
2163      * region starts here.
2164      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2165      * virial calculation and COM pulling, is thus not included in
2166      * the balance timing, which is ok as most of these tasks do communication.
2167      */
2168     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2169     {
2170         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2171     }
2172 }
2173
2174
2175 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2176                         t_inputrec *ir, t_mdatoms *md,
2177                         t_state *state)
2178 {
2179     int             i, m, start, end;
2180     int64_t         step;
2181     real            dt = ir->delta_t;
2182     real            dvdl_dum;
2183     rvec           *savex;
2184
2185     /* We need to allocate one element extra, since we might use
2186      * (unaligned) 4-wide SIMD loads to access rvec entries.
2187      */
2188     snew(savex, state->natoms + 1);
2189
2190     start = 0;
2191     end   = md->homenr;
2192
2193     if (debug)
2194     {
2195         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2196                 start, md->homenr, end);
2197     }
2198     /* Do a first constraint pass to reset the particle positions... */
2199     step = ir->init_step;
2200     if (fplog)
2201     {
2202         char buf[STEPSTRSIZE];
2203         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2204                 gmx_step_str(step, buf));
2205     }
2206     dvdl_dum = 0;
2207
2208     /* constrain the current position */
2209     constr->apply(TRUE, FALSE,
2210                   step, 0, 1.0,
2211                   as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2212                   state->box,
2213                   state->lambda[efptBONDED], &dvdl_dum,
2214                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2215     if (EI_VV(ir->eI))
2216     {
2217         /* constrain the initial velocities, and save them */
2218         /* this may also be useful if we need the ekin from the half step for velocity verlet */
2219         constr->apply(TRUE, FALSE,
2220                       step, 0, 1.0,
2221                       as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2222                       state->box,
2223                       state->lambda[efptBONDED], &dvdl_dum,
2224                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2225     }
2226     /* constrain the initial velocities at t-dt/2 */
2227     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2228     {
2229         for (i = start; (i < end); i++)
2230         {
2231             for (m = 0; (m < DIM); m++)
2232             {
2233                 /* Reverse the velocity */
2234                 state->v[i][m] = -state->v[i][m];
2235                 /* Store the position at t-dt in buf */
2236                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2237             }
2238         }
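             /* With the velocities reversed, savex now holds
              * x(t0) - dt*v(t0), i.e. an estimate of the positions one
              * time step before the start of the run.
              */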
2239         /* Shake the positions at t=-dt with the positions at t=0
2240          * as reference coordinates.
2241          */
2242         if (fplog)
2243         {
2244             char buf[STEPSTRSIZE];
2245             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2246                     gmx_step_str(step, buf));
2247         }
2248         dvdl_dum = 0;
2249         constr->apply(TRUE, FALSE,
2250                       step, -1, 1.0,
2251                       as_rvec_array(state->x.data()), savex, nullptr,
2252                       state->box,
2253                       state->lambda[efptBONDED], &dvdl_dum,
2254                       as_rvec_array(state->v.data()), nullptr, gmx::ConstraintVariable::Positions);
2255
2256         for (i = start; i < end; i++)
2257         {
2258             for (m = 0; m < DIM; m++)
2259             {
2260                 /* Re-reverse the velocities */
2261                 state->v[i][m] = -state->v[i][m];
2262             }
2263         }
2264     }
2265     sfree(savex);
2266 }
2267
2268
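     /* Integrate r^2 times the tabulated potential (and r^3 times its
      * derivative for the virial) over the table points [rstart, rend),
      * using the dispersion (offstart == 0) or repulsion (offstart == 4)
      * table entries; the results, scaled by 4*pi and the table factor,
      * are returned in *enerout and *virout.
      */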
2269 static void
2270 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2271                 double *enerout, double *virout)
2272 {
2273     double enersum, virsum;
2274     double invscale, invscale2, invscale3;
2275     double r, ea, eb, ec, pa, pb, pc, pd;
2276     double y0, f, g, h;
2277     int    ri, offset;
2278     double tabfactor;
2279
2280     invscale  = 1.0/scale;
2281     invscale2 = invscale*invscale;
2282     invscale3 = invscale*invscale2;
2283
2284     /* Following summation derived from cubic spline definition,
2285      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2286      * the cubic spline.  We first calculate the negative of the
2287      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2288      * add the more standard, abrupt cutoff correction to that result,
2289      * yielding the long-range correction for a switched function.  We
2290      * perform both the pressure and energy loops at the same time for
2291      * simplicity, as the computational cost is low. */
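         /* In each table interval the potential is the cubic
          * p(t) = y0 + f*t + g*t^2 + h*t^3 with t in [0,1], so with
          * r' = r + t/scale the energy integral over one interval is
          *   int_0^1 r'^2 p(t) dt/scale.
          * Expanding r'^2 gives, for each power t^k, the weight
          *   ec/(k+1) + eb/(k+2) + ea/(k+3),
          * which is what the y0, f, g and h terms below implement; the
          * virial sum uses the analogous r'^3 dp/dt expansion with the
          * pa..pd coefficients.
          */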
2292
2293     if (offstart == 0)
2294     {
2295         /* Since the dispersion table has been scaled down a factor
2296          * 6.0 and the repulsion a factor 12.0 to compensate for the
2297          * c6/c12 parameters inside nbfp[] being scaled up (to save
2298          * flops in kernels), we need to correct for this.
2299          */
2300         tabfactor = 6.0;
2301     }
2302     else
2303     {
2304         tabfactor = 12.0;
2305     }
2306
2307     enersum = 0.0;
2308     virsum  = 0.0;
2309     for (ri = rstart; ri < rend; ++ri)
2310     {
2311         r  = ri*invscale;
2312         ea = invscale3;
2313         eb = 2.0*invscale2*r;
2314         ec = invscale*r*r;
2315
2316         pa = invscale3;
2317         pb = 3.0*invscale2*r;
2318         pc = 3.0*invscale*r*r;
2319         pd = r*r*r;
2320
2321         /* this "8" is from the packing in the vdwtab array - perhaps
2322            should be defined? */
2323
2324         offset = 8*ri + offstart;
2325         y0     = vdwtab[offset];
2326         f      = vdwtab[offset+1];
2327         g      = vdwtab[offset+2];
2328         h      = vdwtab[offset+3];
2329
2330         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2331         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2332     }
2333     *enerout = 4.0*M_PI*enersum*tabfactor;
2334     *virout  = 4.0*M_PI*virsum*tabfactor;
2335 }
2336
2337 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2338 {
2339     double   eners[2], virs[2], enersum, virsum;
2340     double   r0, rc3, rc9;
2341     int      ri0, ri1, i;
2342     real     scale, *vdwtab;
2343
2344     fr->enershiftsix    = 0;
2345     fr->enershifttwelve = 0;
2346     fr->enerdiffsix     = 0;
2347     fr->enerdifftwelve  = 0;
2348     fr->virdiffsix      = 0;
2349     fr->virdifftwelve   = 0;
2350
2351     const interaction_const_t *ic = fr->ic;
2352
2353     if (eDispCorr != edispcNO)
2354     {
2355         for (i = 0; i < 2; i++)
2356         {
2357             eners[i] = 0;
2358             virs[i]  = 0;
2359         }
2360         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2361             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2362             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2363             (ic->vdwtype == evdwSHIFT) ||
2364             (ic->vdwtype == evdwSWITCH))
2365         {
2366             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2367                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2368                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2369             {
2370                 gmx_fatal(FARGS,
2371                           "With dispersion correction rvdw-switch can not be zero "
2372                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2373             }
2374
2375             /* TODO This code depends on the logic in tables.c that
2376                constructs the table layout, which should be made
2377                explicit in future cleanup. */
2378             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2379                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2380             scale  = fr->dispersionCorrectionTable->scale;
2381             vdwtab = fr->dispersionCorrectionTable->data;
2382
2383             /* Round the cut-offs to exact table values for precision */
2384             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2385             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2386
2387             /* The code below has some support for handling force-switching, i.e.
2388              * when the force (instead of potential) is switched over a limited
2389              * region. This leads to a constant shift in the potential inside the
2390              * switching region, which we can handle by adding a constant energy
2391              * term in the force-switch case just like when we do potential-shift.
2392              *
2393              * For now this is not enabled, but to keep the functionality in the
2394              * code we check separately for switch and shift. When we do force-switch
2395              * the shifting point is rvdw_switch, while it is the cutoff when we
2396              * have a classical potential-shift.
2397              *
2398              * For a pure potential-shift the potential has a constant shift
2399              * all the way out to the cutoff, and that is it. For other forms
2400              * we need to calculate the constant shift up to the point where we
2401              * start modifying the potential.
2402              */
2403             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2404
2405             r0   = ri0/scale;
2406             rc3  = r0*r0*r0;
2407             rc9  = rc3*rc3*rc3;
2408
2409             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2410                 (ic->vdwtype == evdwSHIFT))
2411             {
2412                 /* Determine the constant energy shift below rvdw_switch.
2413                  * Table has a scale factor since we have scaled it down to compensate
2414                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2415                  */
2416                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2417                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2418             }
2419             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2420             {
2421                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2422                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2423             }
2424
2425             /* Add the constant part from 0 to rvdw_switch.
2426              * This integration from 0 to rvdw_switch overcounts the number
2427              * of interactions by 1, as it also counts the self interaction.
2428              * We will correct for this later.
2429              */
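                 /* The constant shift integrated over the sphere of radius r0
                  * gives shift * int_0^r0 4*pi*r^2 dr = shift * 4*pi*r0^3/3.
                  */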
2430             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2431             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2432
2433             /* Calculate the contribution in the range [r0,r1] where we
2434              * modify the potential. For a pure potential-shift modifier we will
2435              * have ri0==ri1, and there will not be any contribution here.
2436              */
2437             for (i = 0; i < 2; i++)
2438             {
2439                 enersum = 0;
2440                 virsum  = 0;
2441                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2442                 eners[i] -= enersum;
2443                 virs[i]  -= virsum;
2444             }
2445
2446             /* Alright: Above we compensated by REMOVING the parts outside r0
2447              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2448              *
2449              * Regardless of whether r0 is the point where we start switching,
2450              * or the cutoff where we calculated the constant shift, we include
2451              * all the parts we are missing out to infinity from r0 by
2452              * calculating the analytical dispersion correction.
2453              */
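                 /* Analytic tails: int_r0^inf 4*pi*r^2*(-1/r^6) dr = -4*pi/(3*r0^3)
                  * for dispersion and int_r0^inf 4*pi*r^2*(1/r^12) dr = 4*pi/(9*r0^9)
                  * for repulsion, with the matching virial integrals below.
                  */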
2454             eners[0] += -4.0*M_PI/(3.0*rc3);
2455             eners[1] +=  4.0*M_PI/(9.0*rc9);
2456             virs[0]  +=  8.0*M_PI/rc3;
2457             virs[1]  += -16.0*M_PI/(3.0*rc9);
2458         }
2459         else if (ic->vdwtype == evdwCUT ||
2460                  EVDW_PME(ic->vdwtype) ||
2461                  ic->vdwtype == evdwUSER)
2462         {
2463             if (ic->vdwtype == evdwUSER && fplog)
2464             {
2465                 fprintf(fplog,
2466                         "WARNING: using dispersion correction with user tables\n");
2467             }
2468
2469             /* Note that with LJ-PME, the dispersion correction is multiplied
2470              * by the difference between the actual C6 and the value of C6
2471              * that would produce the combination rule.
2472              * This means the normal energy and virial difference formulas
2473              * can be used here.
2474              */
2475
2476             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2477             rc9  = rc3*rc3*rc3;
2478             /* Contribution beyond the cut-off */
2479             eners[0] += -4.0*M_PI/(3.0*rc3);
2480             eners[1] +=  4.0*M_PI/(9.0*rc9);
2481             if (ic->vdw_modifier == eintmodPOTSHIFT)
2482             {
2483                 /* Contribution within the cut-off */
2484                 eners[0] += -4.0*M_PI/(3.0*rc3);
2485                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2486             }
2487             /* Contribution beyond the cut-off */
2488             virs[0]  +=  8.0*M_PI/rc3;
2489             virs[1]  += -16.0*M_PI/(3.0*rc9);
2490         }
2491         else
2492         {
2493             gmx_fatal(FARGS,
2494                       "Dispersion correction is not implemented for vdw-type = %s",
2495                       evdw_names[ic->vdwtype]);
2496         }
2497
2498         /* When we deprecate the group kernels the code below can go too */
2499         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2500         {
2501             /* Calculate self-interaction coefficient (assuming that
2502              * the reciprocal-space contribution is constant in the
2503              * region that contributes to the self-interaction).
2504              */
2505             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2506
2507             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2508             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2509         }
2510
2511         fr->enerdiffsix    = eners[0];
2512         fr->enerdifftwelve = eners[1];
2513         /* The 0.5 is due to the Gromacs definition of the virial */
2514         fr->virdiffsix     = 0.5*virs[0];
2515         fr->virdifftwelve  = 0.5*virs[1];
2516     }
2517 }
2518
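     /* Compute the long-range dispersion corrections to the energy,
      * pressure and virial for the current box and lambda; the scalar
      * corrections are returned in *prescorr, *enercorr and *dvdlcorr,
      * and the corresponding tensor contributions in pres and virial
      * (both tensors are cleared first).
      */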
2519 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2520                    matrix box, real lambda, tensor pres, tensor virial,
2521                    real *prescorr, real *enercorr, real *dvdlcorr)
2522 {
2523     gmx_bool bCorrAll, bCorrPres;
2524     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2525     int      m;
2526
2527     *prescorr = 0;
2528     *enercorr = 0;
2529     *dvdlcorr = 0;
2530
2531     clear_mat(virial);
2532     clear_mat(pres);
2533
2534     if (ir->eDispCorr != edispcNO)
2535     {
2536         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2537                      ir->eDispCorr == edispcAllEnerPres);
2538         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2539                      ir->eDispCorr == edispcAllEnerPres);
2540
2541         invvol = 1/det(box);
2542         if (fr->n_tpi)
2543         {
2544             /* Only correct for the interactions with the inserted molecule */
2545             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2546             ninter = fr->n_tpi;
2547         }
2548         else
2549         {
2550             dens   = fr->numAtomsForDispersionCorrection*invvol;
2551             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2552         }
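             /* dens is the number density of correction-relevant atoms and
              * ninter the count of atoms whose interactions are corrected:
              * half the total so that each pair is counted once, or just
              * the n_tpi inserted atoms for test-particle insertion.
              */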
2553
2554         if (ir->efep == efepNO)
2555         {
2556             avcsix    = fr->avcsix[0];
2557             avctwelve = fr->avctwelve[0];
2558         }
2559         else
2560         {
2561             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2562             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2563         }
2564
2565         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2566         *enercorr += avcsix*enerdiff;
2567         dvdlambda  = 0.0;
2568         if (ir->efep != efepNO)
2569         {
2570             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2571         }
2572         if (bCorrAll)
2573         {
2574             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2575             *enercorr += avctwelve*enerdiff;
2576             if (fr->efep != efepNO)
2577             {
2578                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2579             }
2580         }
2581
2582         if (bCorrPres)
2583         {
2584             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2585             if (ir->eDispCorr == edispcAllEnerPres)
2586             {
2587                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2588             }
2589             /* The factor 2 is because of the Gromacs virial definition */
2590             spres = -2.0*invvol*svir*PRESFAC;
2591
2592             for (m = 0; m < DIM; m++)
2593             {
2594                 virial[m][m] += svir;
2595                 pres[m][m]   += spres;
2596             }
2597             *prescorr += spres;
2598         }
2599
2600         /* Can't currently control when it prints; for now, just print when debugging */
2601         if (debug)
2602         {
2603             if (bCorrAll)
2604             {
2605                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2606                         avcsix, avctwelve);
2607             }
2608             if (bCorrPres)
2609             {
2610                 fprintf(debug,
2611                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2612                         *enercorr, spres, svir);
2613             }
2614             else
2615             {
2616                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2617             }
2618         }
2619
2620         if (fr->efep != efepNO)
2621         {
2622             *dvdlcorr += dvdlambda;
2623         }
2624     }
2625 }
2626
2627 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2628                             const gmx_mtop_t *mtop, rvec x[],
2629                             gmx_bool bFirst)
2630 {
2631     t_graph        *graph;
2632     int             as, mol;
2633
2634     if (bFirst && fplog)
2635     {
2636         fprintf(fplog, "Removing pbc first time\n");
2637     }
2638
2639     snew(graph, 1);
2640     as = 0;
2641     for (const gmx_molblock_t &molb : mtop->molblock)
2642     {
2643         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2644         if (moltype.atoms.nr == 1 ||
2645             (!bFirst && moltype.cgs.nr == 1))
2646         {
2647             /* Just one atom or charge group in the molecule, no PBC required */
2648             as += molb.nmol*moltype.atoms.nr;
2649         }
2650         else
2651         {
2652             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2653             mk_graph_ilist(nullptr, moltype.ilist,
2654                            0, moltype.atoms.nr, FALSE, FALSE, graph);
2655
2656             for (mol = 0; mol < molb.nmol; mol++)
2657             {
2658                 mk_mshift(fplog, graph, ePBC, box, x+as);
2659
2660                 shift_self(graph, box, x+as);
2661                 /* The molecule is whole now.
2662                  * We don't need the second mk_mshift call as in do_pbc_first,
2663                  * since we no longer need this graph.
2664                  */
2665
2666                 as += moltype.atoms.nr;
2667             }
2668             done_graph(graph);
2669         }
2670     }
2671     sfree(graph);
2672 }
2673
2674 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2675                        const gmx_mtop_t *mtop, rvec x[])
2676 {
2677     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2678 }
2679
2680 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2681                  const gmx_mtop_t *mtop, rvec x[])
2682 {
2683     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2684 }
2685
2686 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2687 {
2688     int t, nth;
2689     nth = gmx_omp_nthreads_get(emntDefault);
2690
2691 #pragma omp parallel for num_threads(nth) schedule(static)
2692     for (t = 0; t < nth; t++)
2693     {
2694         try
2695         {
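                 /* Give each thread a contiguous, near-equal slice of the
                  * atom range: thread t handles [offset, offset + len).
                  */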
2696             size_t natoms = x.size();
2697             size_t offset = (natoms*t    )/nth;
2698             size_t len    = (natoms*(t + 1))/nth - offset;
2699             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2700         }
2701         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2702     }
2703 }
2704
2705 // TODO This can be cleaned up a lot, and move back to runner.cpp
2706 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2707                 const t_inputrec *inputrec,
2708                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2709                 gmx_walltime_accounting_t walltime_accounting,
2710                 nonbonded_verlet_t *nbv,
2711                 gmx_pme_t *pme,
2712                 gmx_bool bWriteStat)
2713 {
2714     t_nrnb *nrnb_tot = nullptr;
2715     double  delta_t  = 0;
2716     double  nbfs     = 0, mflop = 0;
2717     double  elapsed_time,
2718             elapsed_time_over_all_ranks,
2719             elapsed_time_over_all_threads,
2720             elapsed_time_over_all_threads_over_all_ranks;
2721     /* Control whether it is valid to print a report. Only the
2722        simulation master may print, but it should not do so if the run
2723        terminated e.g. before a scheduled reset step. This is
2724        complicated by the fact that PME ranks are unaware of the
2725        reason why they were sent a pmerecvqxFINISH. To avoid
2726        communication deadlocks, we always do the communication for the
2727        report, even if we've decided not to write the report, because
2728        how long it takes to finish the run is not important when we've
2729        decided not to report on the simulation performance.
2730
2731        Further, we only report performance for dynamical integrators,
2732        because those are the only ones for which we plan to
2733        consider doing any optimizations. */
2734     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2735
2736     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2737     {
2738         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2739         printReport = false;
2740     }
2741
2742     if (cr->nnodes > 1)
2743     {
2744         snew(nrnb_tot, 1);
2745 #if GMX_MPI
2746         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2747                       cr->mpi_comm_mysim);
2748 #endif
2749     }
2750     else
2751     {
2752         nrnb_tot = nrnb;
2753     }
2754
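    /* Note: below, the wall time is averaged over the MPI ranks (each rank
     * runs for roughly the same wall-clock time), whereas the per-thread
     * times are summed over ranks, giving the total core time of the run. */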
2755     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2756     elapsed_time_over_all_ranks                  = elapsed_time;
2757     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2758     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2759 #if GMX_MPI
2760     if (cr->nnodes > 1)
2761     {
2762         /* reduce elapsed_time over all MPI ranks in the current simulation */
2763         MPI_Allreduce(&elapsed_time,
2764                       &elapsed_time_over_all_ranks,
2765                       1, MPI_DOUBLE, MPI_SUM,
2766                       cr->mpi_comm_mysim);
2767         elapsed_time_over_all_ranks /= cr->nnodes;
2768         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2769          * current simulation. */
2770         MPI_Allreduce(&elapsed_time_over_all_threads,
2771                       &elapsed_time_over_all_threads_over_all_ranks,
2772                       1, MPI_DOUBLE, MPI_SUM,
2773                       cr->mpi_comm_mysim);
2774     }
2775 #endif
2776
2777     if (printReport)
2778     {
2779         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2780     }
2781     if (cr->nnodes > 1)
2782     {
2783         sfree(nrnb_tot);
2784     }
2785
2786     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2787     {
2788         print_dd_statistics(cr, inputrec, fplog);
2789     }
2790
2791     /* TODO Move the responsibility for any scaling by thread counts
2792      * to the code that handled the thread region, so that there's a
2793      * mechanism to keep cycle counting working during the transition
2794      * to task parallelism. */
2795     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2796     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2797     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2798     auto cycle_sum(wallcycle_sum(cr, wcycle));
2799
2800     if (printReport)
2801     {
2802         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2803         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2804         if (pme_gpu_task_enabled(pme))
2805         {
2806             pme_gpu_get_timings(pme, &pme_gpu_timings);
2807         }
2808         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2809                         elapsed_time_over_all_ranks,
2810                         wcycle, cycle_sum,
2811                         nbnxn_gpu_timings,
2812                         &pme_gpu_timings);
2813
2814         if (EI_DYNAMICS(inputrec->eI))
2815         {
2816             delta_t = inputrec->delta_t;
2817         }
2818
2819         if (fplog)
2820         {
2821             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2822                        elapsed_time_over_all_ranks,
2823                        walltime_accounting_get_nsteps_done(walltime_accounting),
2824                        delta_t, nbfs, mflop);
2825         }
2826         if (bWriteStat)
2827         {
2828             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2829                        elapsed_time_over_all_ranks,
2830                        walltime_accounting_get_nsteps_done(walltime_accounting),
2831                        delta_t, nbfs, mflop);
2832         }
2833     }
2834 }
2835
2836 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2837 {
2838     /* this function works, but could probably use a logic rewrite to keep all the different
2839        types of efep straight. */
2840
2841     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2842     {
2843         return;
2844     }
2845
2846     t_lambda *fep = ir->fepvals;
2847     *fep_state    = fep->init_fep_state; /* this might overwrite the state read from
2848                                             the checkpoint; a kludge is currently in
2849                                             place to prevent that. */
2850
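    /* If init_lambda was set (it is -1 when unset), every lambda component
     * starts from that single value; otherwise each component is taken from
     * the all_lambda table entry for that component at the initial FEP state. */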
2851     for (int i = 0; i < efptNR; i++)
2852     {
2853         /* For now, overwrite the lambda state with init_lambda for backwards compatibility */
2854         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2855         {
2856             lambda[i] = fep->init_lambda;
2857             if (lam0)
2858             {
2859                 lam0[i] = lambda[i];
2860             }
2861         }
2862         else
2863         {
2864             lambda[i] = fep->all_lambda[i][*fep_state];
2865             if (lam0)
2866             {
2867                 lam0[i] = lambda[i];
2868             }
2869         }
2870     }
2871     if (ir->bSimTemp)
2872     {
2873         /* need to rescale control temperatures to match current state */
2874         for (int i = 0; i < ir->opts.ngtc; i++)
2875         {
2876             if (ir->opts.ref_t[i] > 0)
2877             {
2878                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2879             }
2880         }
2881     }
2882
2883     /* Report the initial lambda values to the log file */
2884     if (fplog != nullptr)
2885     {
2886         fprintf(fplog, "Initial vector of lambda components: [ ");
2887         for (const auto &l : lambda)
2888         {
2889             fprintf(fplog, "%10.4f ", l);
2890         }
2891         fprintf(fplog, "]\n");
2892     }
2893 }
2894
2895
2896 void init_md(FILE *fplog,
2897              const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2898              t_inputrec *ir, const gmx_output_env_t *oenv,
2899              const MdrunOptions &mdrunOptions,
2900              double *t, double *t0,
2901              t_state *globalState, double *lam0,
2902              t_nrnb *nrnb, gmx_mtop_t *mtop,
2903              gmx_update_t **upd,
2904              gmx::BoxDeformation *deform,
2905              int nfile, const t_filenm fnm[],
2906              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2907              tensor force_vir, tensor shake_vir, rvec mu_tot,
2908              gmx_bool *bSimAnn, t_vcm **vcm,
2909              gmx_wallcycle_t wcycle)
2910 {
2911     int  i;
2912
2913     /* Initial values */
2914     *t = *t0       = ir->init_t;
2915
2916     *bSimAnn = FALSE;
2917     for (i = 0; i < ir->opts.ngtc; i++)
2918     {
2919         /* set bSimAnn if any group is being annealed */
2920         if (ir->opts.annealing[i] != eannNO)
2921         {
2922             *bSimAnn = TRUE;
2923         }
2924     }
2925
2926     /* Initialize lambda variables */
2927     /* TODO: Clean up initialization of fep_state and lambda in t_state.
2928      * We currently need to call initialize_lambdas on non-master ranks
2929      * to initialize lam0.
2930      */
2931     if (MASTER(cr))
2932     {
2933         initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2934     }
2935     else
2936     {
2937         int                      tmpFepState;
2938         std::array<real, efptNR> tmpLambda;
2939         initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2940     }
2941
2942     // TODO upd is never NULL in practice, but the analysers don't know that
2943     if (upd)
2944     {
2945         *upd = init_update(ir, deform);
2946     }
2947     if (*bSimAnn)
2948     {
2949         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2950     }
2951
2952     if (vcm != nullptr)
2953     {
2954         *vcm = init_vcm(fplog, &mtop->groups, ir);
2955     }
2956
2957     if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2958     {
2959         if (ir->etc == etcBERENDSEN)
2960         {
2961             please_cite(fplog, "Berendsen84a");
2962         }
2963         if (ir->etc == etcVRESCALE)
2964         {
2965             please_cite(fplog, "Bussi2007a");
2966         }
2967         if (ir->eI == eiSD1)
2968         {
2969             please_cite(fplog, "Goga2012");
2970         }
2971     }
2972     init_nrnb(nrnb);
2973
2974     if (nfile != -1)
2975     {
2976         *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2977
2978         *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2979                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2980     }
2981
2982     /* Initialize variables */
2983     clear_mat(force_vir);
2984     clear_mat(shake_vir);
2985     clear_rvec(mu_tot);
2986 }