// alexxy/gromacs.git: src/gromacs/mdlib/sim_util.cpp (commit 4b624481469db67014899518189d8d096edcbee3)
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed_forces/bonded.h"
65 #include "gromacs/listed_forces/disre.h"
66 #include "gromacs/listed_forces/gpubonded.h"
67 #include "gromacs/listed_forces/manage_threading.h"
68 #include "gromacs/listed_forces/orires.h"
69 #include "gromacs/math/arrayrefwithpadding.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/units.h"
72 #include "gromacs/math/vec.h"
73 #include "gromacs/math/vecdump.h"
74 #include "gromacs/mdlib/calcmu.h"
75 #include "gromacs/mdlib/calcvir.h"
76 #include "gromacs/mdlib/constr.h"
77 #include "gromacs/mdlib/force.h"
78 #include "gromacs/mdlib/forcerec.h"
79 #include "gromacs/mdlib/gmx_omp_nthreads.h"
80 #include "gromacs/mdlib/mdrun.h"
81 #include "gromacs/mdlib/nb_verlet.h"
82 #include "gromacs/mdlib/nbnxn_atomdata.h"
83 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
84 #include "gromacs/mdlib/nbnxn_grid.h"
85 #include "gromacs/mdlib/nbnxn_search.h"
86 #include "gromacs/mdlib/ppforceworkload.h"
87 #include "gromacs/mdlib/qmmm.h"
88 #include "gromacs/mdlib/update.h"
89 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
90 #include "gromacs/mdtypes/commrec.h"
91 #include "gromacs/mdtypes/enerdata.h"
92 #include "gromacs/mdtypes/forceoutput.h"
93 #include "gromacs/mdtypes/iforceprovider.h"
94 #include "gromacs/mdtypes/inputrec.h"
95 #include "gromacs/mdtypes/md_enums.h"
96 #include "gromacs/mdtypes/state.h"
97 #include "gromacs/pbcutil/ishift.h"
98 #include "gromacs/pbcutil/mshift.h"
99 #include "gromacs/pbcutil/pbc.h"
100 #include "gromacs/pulling/pull.h"
101 #include "gromacs/pulling/pull_rotation.h"
102 #include "gromacs/timing/cyclecounter.h"
103 #include "gromacs/timing/gpu_timing.h"
104 #include "gromacs/timing/wallcycle.h"
105 #include "gromacs/timing/wallcyclereporting.h"
106 #include "gromacs/timing/walltime_accounting.h"
107 #include "gromacs/topology/topology.h"
108 #include "gromacs/utility/arrayref.h"
109 #include "gromacs/utility/basedefinitions.h"
110 #include "gromacs/utility/cstringutil.h"
111 #include "gromacs/utility/exceptions.h"
112 #include "gromacs/utility/fatalerror.h"
113 #include "gromacs/utility/gmxassert.h"
114 #include "gromacs/utility/gmxmpi.h"
115 #include "gromacs/utility/logger.h"
116 #include "gromacs/utility/smalloc.h"
117 #include "gromacs/utility/strconvert.h"
118 #include "gromacs/utility/sysinfo.h"
119
120 #include "nbnxn_gpu.h"
121 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
122 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
123
124 // TODO: this environment variable allows us to verify before release
125 // that on less common architectures the total cost of polling is not larger than
126 // a blocking wait (so polling does not introduce overhead when the static
127 // PME-first ordering would suffice).
128 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
129
130
131 void print_time(FILE                     *out,
132                 gmx_walltime_accounting_t walltime_accounting,
133                 int64_t                   step,
134                 const t_inputrec         *ir,
135                 const t_commrec          *cr)
136 {
137     time_t finish;
138     double dt, elapsed_seconds, time_per_step;
139
140 #if !GMX_THREAD_MPI
141     if (!PAR(cr))
142 #endif
143     {
144         fprintf(out, "\r");
145     }
146     fputs("step ", out);
147     fputs(gmx::int64ToString(step).c_str(), out);
148     fflush(out);
149
150     if ((step >= ir->nstlist))
151     {
152         double seconds_since_epoch = gmx_gettime();
153         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
154         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
155         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
156
157         if (ir->nsteps >= 0)
158         {
159             if (dt >= 300)
160             {
161                 finish = static_cast<time_t>(seconds_since_epoch + dt);
162                 auto timebuf = gmx_ctime_r(&finish);
163                 timebuf.erase(timebuf.find_first_of('\n'));
164                 fputs(", will finish ", out);
165                 fputs(timebuf.c_str(), out);
166             }
167             else
168             {
169                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
170             }
171         }
172         else
173         {
174             fprintf(out, " performance: %.1f ns/day    ",
175                     ir->delta_t/1000*24*60*60/time_per_step);
176         }
177     }
178 #if !GMX_THREAD_MPI
179     if (PAR(cr))
180     {
181         fprintf(out, "\n");
182     }
183 #else
184     GMX_UNUSED_VALUE(cr);
185 #endif
186
187     fflush(out);
188 }
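/* Worked example for the ns/day estimate printed above: ir->delta_t is in ps,
 * so delta_t/1000 is the simulated time per step in ns, and 86400/time_per_step
 * is the number of steps that fit in one day of wall time. With an assumed
 * delta_t = 0.002 ps and time_per_step = 0.004 s this gives
 *   0.002/1000 * 86400 / 0.004 = 43.2 ns/day.
 */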
189
190 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
191                          double the_time)
192 {
193     if (!fplog)
194     {
195         return;
196     }
197
198     time_t temp_time = static_cast<time_t>(the_time);
199
200     auto   timebuf = gmx_ctime_r(&temp_time);
201
202     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
203 }
204
205 void print_start(FILE *fplog, const t_commrec *cr,
206                  gmx_walltime_accounting_t walltime_accounting,
207                  const char *name)
208 {
209     char buf[STRLEN];
210
211     sprintf(buf, "Started %s", name);
212     print_date_and_time(fplog, cr->nodeid, buf,
213                         walltime_accounting_get_start_time_stamp(walltime_accounting));
214 }
215
216 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
217 {
218     const int      end = forceToAdd.size();
219
220     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
221 #pragma omp parallel for num_threads(nt) schedule(static)
222     for (int i = 0; i < end; i++)
223     {
224         rvec_inc(f[i], forceToAdd[i]);
225     }
226 }
227
228 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
229                         tensor vir_part, const t_graph *graph, const matrix box,
230                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
231 {
232     /* The short-range virial from surrounding boxes */
233     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
234     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
235
236     /* Calculate partial virial, for local atoms only, based on short range.
237      * Total virial is computed in global_stat, called from do_md
238      */
239     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
240     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
241
242     if (debug)
243     {
244         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
245     }
246 }
247
248 static void pull_potential_wrapper(const t_commrec *cr,
249                                    const t_inputrec *ir,
250                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
251                                    gmx::ForceWithVirial *force,
252                                    const t_mdatoms *mdatoms,
253                                    gmx_enerdata_t *enerd,
254                                    const real *lambda,
255                                    double t,
256                                    gmx_wallcycle_t wcycle)
257 {
258     t_pbc  pbc;
259     real   dvdl;
260
261     /* Calculate the center of mass forces; this requires communication,
262      * which is why pull_potential is called close to other communication.
263      */
264     wallcycle_start(wcycle, ewcPULLPOT);
265     set_pbc(&pbc, ir->ePBC, box);
266     dvdl                     = 0;
267     enerd->term[F_COM_PULL] +=
268         pull_potential(ir->pull_work, mdatoms, &pbc,
269                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
270     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
271     wallcycle_stop(wcycle, ewcPULLPOT);
272 }
273
274 static void pme_receive_force_ener(const t_commrec      *cr,
275                                    gmx::ForceWithVirial *forceWithVirial,
276                                    gmx_enerdata_t       *enerd,
277                                    gmx_wallcycle_t       wcycle)
278 {
279     real   e_q, e_lj, dvdl_q, dvdl_lj;
280     float  cycles_ppdpme, cycles_seppme;
281
282     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
283     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
284
285     /* In case of node-splitting, the PP nodes receive the long-range
286      * forces, virial and energy from the PME nodes here.
287      */
288     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
289     dvdl_q  = 0;
290     dvdl_lj = 0;
291     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
292                       &cycles_seppme);
293     enerd->term[F_COUL_RECIP] += e_q;
294     enerd->term[F_LJ_RECIP]   += e_lj;
295     enerd->dvdl_lin[efptCOUL] += dvdl_q;
296     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
297
298     if (wcycle)
299     {
300         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
301     }
302     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
303 }
304
305 static void print_large_forces(FILE            *fp,
306                                const t_mdatoms *md,
307                                const t_commrec *cr,
308                                int64_t          step,
309                                real             forceTolerance,
310                                const rvec      *x,
311                                const rvec      *f)
312 {
313     real           force2Tolerance = gmx::square(forceTolerance);
314     gmx::index     numNonFinite    = 0;
315     for (int i = 0; i < md->homenr; i++)
316     {
317         real force2    = norm2(f[i]);
318         bool nonFinite = !std::isfinite(force2);
319         if (force2 >= force2Tolerance || nonFinite)
320         {
321             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
322                     step,
323                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
324         }
325         if (nonFinite)
326         {
327             numNonFinite++;
328         }
329     }
330     if (numNonFinite > 0)
331     {
332         /* Note that with MPI this fatal call on one rank might interrupt
333          * the printing on other ranks. But we can only avoid that with
334          * an expensive MPI barrier that we would need at each step.
335          */
336         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
337     }
338 }
339
340 static void post_process_forces(const t_commrec           *cr,
341                                 int64_t                    step,
342                                 t_nrnb                    *nrnb,
343                                 gmx_wallcycle_t            wcycle,
344                                 const gmx_localtop_t      *top,
345                                 const matrix               box,
346                                 const rvec                 x[],
347                                 rvec                       f[],
348                                 gmx::ForceWithVirial      *forceWithVirial,
349                                 tensor                     vir_force,
350                                 const t_mdatoms           *mdatoms,
351                                 const t_graph             *graph,
352                                 const t_forcerec          *fr,
353                                 const gmx_vsite_t         *vsite,
354                                 int                        flags)
355 {
356     if (fr->haveDirectVirialContributions)
357     {
358         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
359
360         if (vsite)
361         {
362             /* Spread the mesh force on virtual sites to the other particles...
363              * This is parallelized. MPI communication is performed
364              * if the constructing atoms aren't local.
365              */
366             matrix virial = { { 0 } };
367             spread_vsite_f(vsite, x, fDirectVir, nullptr,
368                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
369                            nrnb,
370                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
371             forceWithVirial->addVirialContribution(virial);
372         }
373
374         if (flags & GMX_FORCE_VIRIAL)
375         {
376             /* Now add the forces, this is local */
377             sum_forces(f, forceWithVirial->force_);
378
379             /* Add the direct virial contributions */
380             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
381             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
382
383             if (debug)
384             {
385                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
386             }
387         }
388     }
389
390     if (fr->print_force >= 0)
391     {
392         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
393     }
394 }
395
396 static void do_nb_verlet(const t_forcerec *fr,
397                          const interaction_const_t *ic,
398                          gmx_enerdata_t *enerd,
399                          int flags, int ilocality,
400                          int clearF,
401                          int64_t step,
402                          t_nrnb *nrnb,
403                          gmx_wallcycle_t wcycle)
404 {
405     if (!(flags & GMX_FORCE_NONBONDED))
406     {
407         /* skip non-bonded calculation */
408         return;
409     }
410
411     nonbonded_verlet_t       *nbv  = fr->nbv;
412     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
413
414     /* GPU kernel launch overhead is already timed separately */
415     if (fr->cutoff_scheme != ecutsVERLET)
416     {
417         gmx_incons("Invalid cut-off scheme passed!");
418     }
419
420     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
421
422     if (!bUsingGpuKernels)
423     {
424         /* When dynamic pair-list pruning is requested, we need to prune
425          * every nstlistPrune steps.
426          */
427         if (nbv->listParams->useDynamicPruning &&
428             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
429         {
430             /* Prune the pair-list beyond fr->ic->rlistPrune using
431              * the current coordinates of the atoms.
432              */
433             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
434             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
435             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
436         }
437
438         wallcycle_sub_start(wcycle, ewcsNONBONDED);
439     }
440
441     switch (nbvg->kernel_type)
442     {
443         case nbnxnk4x4_PlainC:
444         case nbnxnk4xN_SIMD_4xN:
445         case nbnxnk4xN_SIMD_2xNN:
446             nbnxn_kernel_cpu(nbvg,
447                              nbv->nbat,
448                              ic,
449                              fr->shift_vec,
450                              flags,
451                              clearF,
452                              fr->fshift[0],
453                              enerd->grpp.ener[egCOULSR],
454                              fr->bBHAM ?
455                              enerd->grpp.ener[egBHAMSR] :
456                              enerd->grpp.ener[egLJSR]);
457             break;
458
459         case nbnxnk8x8x8_GPU:
460             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
461             break;
462
463         case nbnxnk8x8x8_PlainC:
464             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nblGpu[0],
465                                  nbv->nbat, ic,
466                                  fr->shift_vec,
467                                  flags,
468                                  clearF,
469                                  nbv->nbat->out[0].f,
470                                  fr->fshift[0],
471                                  enerd->grpp.ener[egCOULSR],
472                                  fr->bBHAM ?
473                                  enerd->grpp.ener[egBHAMSR] :
474                                  enerd->grpp.ener[egLJSR]);
475             break;
476
477         default:
478             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
479
480     }
481     if (!bUsingGpuKernels)
482     {
483         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
484     }
485
486     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
487     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
488     {
489         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
490     }
491     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
492              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
493     {
494         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
495     }
496     else
497     {
498         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
499     }
500     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
501     if (flags & GMX_FORCE_ENERGY)
502     {
503         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
504         enr_nbnxn_kernel_ljc += 1;
505         enr_nbnxn_kernel_lj  += 1;
506     }
507
508     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
509              nbvg->nbl_lists.natpair_ljq);
510     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
511              nbvg->nbl_lists.natpair_lj);
512     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
513     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
514              nbvg->nbl_lists.natpair_q);
515
516     if (ic->vdw_modifier == eintmodFORCESWITCH)
517     {
518         /* We add up the switch cost separately */
519         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
520                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
521     }
522     if (ic->vdw_modifier == eintmodPOTSWITCH)
523     {
524         /* We add up the switch cost separately */
525         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
526                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
527     }
528     if (ic->vdwtype == evdwPME)
529     {
530         /* We add up the LJ Ewald cost separately */
531         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
532                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
533     }
534 }
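/* Example of the flop accounting above: with reaction-field electrostatics
 * (EEL_RF) and GMX_FORCE_ENERGY set, LJ+Coulomb pairs are booked under
 * eNR_NBNXN_LJ_RF + 1 (the F+E kernel variant), LJ-only pairs under
 * eNR_NBNXN_LJ + 1, and Coulomb-only pairs under the corresponding counter
 * shifted by -eNR_NBNXN_LJ_RF + eNR_NBNXN_RF.
 */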
535
536 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
537                              t_forcerec           *fr,
538                              rvec                  x[],
539                              rvec                  f[],
540                              const t_mdatoms      *mdatoms,
541                              t_lambda             *fepvals,
542                              real                 *lambda,
543                              gmx_enerdata_t       *enerd,
544                              int                   flags,
545                              t_nrnb               *nrnb,
546                              gmx_wallcycle_t       wcycle)
547 {
548     int              donb_flags;
549     nb_kernel_data_t kernel_data;
550     real             lam_i[efptNR];
551     real             dvdl_nb[efptNR];
552     int              th;
553     int              i, j;
554
555     donb_flags = 0;
556     /* Add short-range interactions */
557     donb_flags |= GMX_NONBONDED_DO_SR;
558
559     /* Currently all group scheme kernels always calculate (shift-)forces */
560     if (flags & GMX_FORCE_FORCES)
561     {
562         donb_flags |= GMX_NONBONDED_DO_FORCE;
563     }
564     if (flags & GMX_FORCE_VIRIAL)
565     {
566         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
567     }
568     if (flags & GMX_FORCE_ENERGY)
569     {
570         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
571     }
572
573     kernel_data.flags  = donb_flags;
574     kernel_data.lambda = lambda;
575     kernel_data.dvdl   = dvdl_nb;
576
577     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
578     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
579
580     /* reset free energy components */
581     for (i = 0; i < efptNR; i++)
582     {
583         dvdl_nb[i]  = 0;
584     }
585
586     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
587
588     wallcycle_sub_start(wcycle, ewcsNONBONDED);
589 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
590     for (th = 0; th < nbl_lists->nnbl; th++)
591     {
592         try
593         {
594             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
595                                       x, f, fr, mdatoms, &kernel_data, nrnb);
596         }
597         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
598     }
599
600     if (fepvals->sc_alpha != 0)
601     {
602         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
603         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
604     }
605     else
606     {
607         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
608         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
609     }
610
611     /* If we do foreign lambda and we have soft-core interactions
612      * we have to recalculate the (non-linear) energy contributions.
613      */
614     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
615     {
616         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
617         kernel_data.lambda         = lam_i;
618         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
619         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
620         /* Note that we add to kernel_data.dvdl, but ignore the result */
621
622         for (i = 0; i < enerd->n_lambda; i++)
623         {
624             for (j = 0; j < efptNR; j++)
625             {
626                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
627             }
628             reset_foreign_enerdata(enerd);
629 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
630             for (th = 0; th < nbl_lists->nnbl; th++)
631             {
632                 try
633                 {
634                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
635                                               x, f, fr, mdatoms, &kernel_data, nrnb);
636                 }
637                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
638             }
639
640             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
641             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
642         }
643     }
644
645     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
646 }
647
648 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
649 {
650     return nbv != nullptr && nbv->bUseGPU;
651 }
652
653 static inline void clear_rvecs_omp(int n, rvec v[])
654 {
655     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
656
657     /* Note that we would like to avoid this conditional by putting it
658      * into the omp pragma instead, but then we still take the full
659      * omp parallel for overhead (at least with gcc5).
660      */
661     if (nth == 1)
662     {
663         for (int i = 0; i < n; i++)
664         {
665             clear_rvec(v[i]);
666         }
667     }
668     else
669     {
670 #pragma omp parallel for num_threads(nth) schedule(static)
671         for (int i = 0; i < n; i++)
672         {
673             clear_rvec(v[i]);
674         }
675     }
676 }
677
678 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
679  *
680  * \param groupOptions  Group options, containing T-coupling options
681  */
682 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
683 {
684     real nrdfCoupled   = 0;
685     real nrdfUncoupled = 0;
686     real kineticEnergy = 0;
687     for (int g = 0; g < groupOptions.ngtc; g++)
688     {
689         if (groupOptions.tau_t[g] >= 0)
690         {
691             nrdfCoupled   += groupOptions.nrdf[g];
692             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
693         }
694         else
695         {
696             nrdfUncoupled += groupOptions.nrdf[g];
697         }
698     }
699
700     /* This conditional with > also catches nrdf=0 */
701     if (nrdfCoupled > nrdfUncoupled)
702     {
703         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
704     }
705     else
706     {
707         return 0;
708     }
709 }
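/* Worked example of the estimate above (GROMACS units, BOLTZ ~= 0.0083145
 * kJ/(mol K)): one T-coupled group with nrdf = 3000 at ref_t = 300 K plus an
 * uncoupled group with nrdf = 600 gives
 *   kineticEnergy = 3000 * 0.5 * 300 * 0.0083145 ~= 3742 kJ/mol,
 * which is scaled by (3000 + 600)/3000 to roughly 4490 kJ/mol for the system.
 */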
710
711 /*! \brief This routine checks that the potential energy is finite.
712  *
713  * Always checks that the potential energy is finite. If step equals
714  * inputrec.init_step also checks that the magnitude of the potential energy
715  * is reasonable. Terminates with a fatal error when a check fails.
716  * Note that passing this check does not guarantee finite forces,
717  * since those use slightly different arithmetic. But in most cases
718  * there is just a narrow coordinate range where forces are not finite
719  * and energies are finite.
720  *
721  * \param[in] step      The step number, used for checking and printing
722  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
723  * \param[in] inputrec  The input record
724  */
725 static void checkPotentialEnergyValidity(int64_t               step,
726                                          const gmx_enerdata_t &enerd,
727                                          const t_inputrec     &inputrec)
728 {
729     /* Threshold valid for comparing absolute potential energy against
730      * the kinetic energy. Normally one should not consider absolute
731      * potential energy values, but with a factor of one million
732      * we should never get false positives.
733      */
734     constexpr real c_thresholdFactor = 1e6;
735
736     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
737     real           averageKineticEnergy = 0;
738     /* We only check for large potential energy at the initial step,
739      * because that is by far the most likely step for this to occur
740      * and because computing the average kinetic energy is not free.
741      * Note: nstcalcenergy >> 1 often does not allow catching large energies
742      * before they become NaN.
743      */
744     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
745     {
746         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
747     }
748
749     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
750                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
751     {
752         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
753                   step,
754                   enerd.term[F_EPOT],
755                   energyIsNotFinite ? "not finite" : "extremely high",
756                   enerd.term[F_LJ],
757                   enerd.term[F_COUL_SR],
758                   energyIsNotFinite ? "non-finite" : "very high",
759                   energyIsNotFinite ? " or NaN" : "");
760     }
761 }
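/* Worked example of the check above: with an average kinetic-energy estimate
 * of ~4490 kJ/mol (see the example after averageKineticEnergyEstimate) and
 * c_thresholdFactor = 1e6, the fatal error only triggers at the initial step
 * when the potential energy exceeds roughly 4.5e9 kJ/mol, or at any step when
 * it is not finite.
 */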
762
763 /*! \brief Compute forces and/or energies for special algorithms
764  *
765  * The intention is to collect all calls to algorithms that compute
766  * forces on local atoms only and that do not contribute to the local
767  * virial sum (but add their virial contribution separately).
768  * Eventually these should likely all become ForceProviders.
769  * Within this function the intention is to have algorithms that do
770  * global communication at the end, so global barriers within the MD loop
771  * are as close together as possible.
772  *
773  * \param[in]     fplog            The log file
774  * \param[in]     cr               The communication record
775  * \param[in]     inputrec         The input record
776  * \param[in]     awh              The Awh module (nullptr if none in use).
777  * \param[in]     enforcedRotation Enforced rotation module.
778  * \param[in]     step             The current MD step
779  * \param[in]     t                The current time
780  * \param[in,out] wcycle           Wallcycle accounting struct
781  * \param[in,out] forceProviders   Pointer to a list of force providers
782  * \param[in]     box              The unit cell
783  * \param[in]     x                The coordinates
784  * \param[in]     mdatoms          Per atom properties
785  * \param[in]     lambda           Array of free-energy lambda values
786  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
787  * \param[in,out] forceWithVirial  Force and virial buffers
788  * \param[in,out] enerd            Energy buffer
789  * \param[in,out] ed               Essential dynamics pointer
790  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
791  *
792  * \todo Remove bNS, which is used incorrectly.
793  * \todo Convert all other algorithms called here to ForceProviders.
794  */
795 static void
796 computeSpecialForces(FILE                          *fplog,
797                      const t_commrec               *cr,
798                      const t_inputrec              *inputrec,
799                      gmx::Awh                      *awh,
800                      gmx_enfrot                    *enforcedRotation,
801                      int64_t                        step,
802                      double                         t,
803                      gmx_wallcycle_t                wcycle,
804                      ForceProviders                *forceProviders,
805                      matrix                         box,
806                      gmx::ArrayRef<const gmx::RVec> x,
807                      const t_mdatoms               *mdatoms,
808                      real                          *lambda,
809                      int                            forceFlags,
810                      gmx::ForceWithVirial          *forceWithVirial,
811                      gmx_enerdata_t                *enerd,
812                      gmx_edsam                     *ed,
813                      gmx_bool                       bNS)
814 {
815     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
816
817     /* NOTE: Currently all ForceProviders only provide forces.
818      *       When they also provide energies, remove this conditional.
819      */
820     if (computeForces)
821     {
822         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
823         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
824
825         /* Collect forces from modules */
826         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
827     }
828
829     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
830     {
831         pull_potential_wrapper(cr, inputrec, box, x,
832                                forceWithVirial,
833                                mdatoms, enerd, lambda, t,
834                                wcycle);
835
836         if (awh)
837         {
838             enerd->term[F_COM_PULL] +=
839                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
840                                                   forceWithVirial,
841                                                   t, step, wcycle, fplog);
842         }
843     }
844
845     rvec *f = as_rvec_array(forceWithVirial->force_.data());
846
847     /* Add the forces from enforced rotation potentials (if any) */
848     if (inputrec->bRot)
849     {
850         wallcycle_start(wcycle, ewcROTadd);
851         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
852         wallcycle_stop(wcycle, ewcROTadd);
853     }
854
855     if (ed)
856     {
857         /* Note that since init_edsam() is called after the initialization
858          * of forcerec, edsam doesn't request the noVirSum force buffer.
859          * Thus if no other algorithm (e.g. PME) requires it, the forces
860          * here will contribute to the virial.
861          */
862         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
863     }
864
865     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
866     if (inputrec->bIMD && computeForces)
867     {
868         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
869     }
870 }
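/* Illustrative sketch (not part of GROMACS) of what the \todo above hints at:
 * converting one of these algorithms into a ForceProvider. The shape of the
 * interface is assumed from how ForceProviderInput/ForceProviderOutput are
 * used in computeSpecialForces(); the member names mdatoms_ and
 * forceWithVirial_ are assumptions, so the sketch is kept out of the build.
 */
#if 0
class ConstantPushForceProvider final : public gmx::IForceProvider
{
    public:
        //! Adds a small constant force along x to every local atom (sketch only).
        void calculateForces(const gmx::ForceProviderInput &forceProviderInput,
                             gmx::ForceProviderOutput      *forceProviderOutput) override
        {
            const int numAtoms = forceProviderInput.mdatoms_.homenr;
            for (int i = 0; i < numAtoms; i++)
            {
                forceProviderOutput->forceWithVirial_.force_[i][XX] += 0.1;
            }
        }
};
#endif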
871
872 /*! \brief Launch the prepare_step and spread stages of PME GPU.
873  *
874  * \param[in]  pmedata       The PME structure
875  * \param[in]  box           The box matrix
876  * \param[in]  x             Coordinate array
877  * \param[in]  flags         Force flags
878  * \param[in]  pmeFlags      PME flags
879  * \param[in]  wcycle        The wallcycle structure
880  */
881 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
882                                       matrix          box,
883                                       rvec            x[],
884                                       int             flags,
885                                       int             pmeFlags,
886                                       gmx_wallcycle_t wcycle)
887 {
888     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
889     pme_gpu_launch_spread(pmedata, x, wcycle);
890 }
891
892 /*! \brief Launch the FFT and gather stages of PME GPU
893  *
894  * This function only implements setting the output forces (no accumulation).
895  *
896  * \param[in]  pmedata        The PME structure
897  * \param[in]  wcycle         The wallcycle structure
898  */
899 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
900                                      gmx_wallcycle_t   wcycle)
901 {
902     pme_gpu_launch_complex_transforms(pmedata, wcycle);
903     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
904 }
905
906 /*! \brief
907  *  Polling wait for either of the PME or nonbonded GPU tasks.
908  *
909  * Instead of a static order in waiting for GPU tasks, this function
910  * polls checking which of the two tasks completes first, and does the
911  * polls to check which of the two tasks completes first, and does the
912  * By doing that, unlike static scheduling order, it can always overlap
913  * one of the reductions, regardless of the GPU task completion order.
914  *
915  * \param[in]     nbv              Nonbonded verlet structure
916  * \param[in,out] pmedata          PME module data
917  * \param[in,out] force            Force array to reduce task outputs into.
918  * \param[in,out] forceWithVirial  Force and virial buffers
919  * \param[in,out] fshift           Shift force output vector results are reduced into
920  * \param[in,out] enerd            Energy data structure results are reduced into
921  * \param[in]     flags            Force flags
922  * \param[in]     pmeFlags         PME flags
923  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
924  * \param[in]     wcycle           The wallcycle structure
925  */
926 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
927                                         gmx_pme_t                           *pmedata,
928                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
929                                         gmx::ForceWithVirial                *forceWithVirial,
930                                         rvec                                 fshift[],
931                                         gmx_enerdata_t                      *enerd,
932                                         int                                  flags,
933                                         int                                  pmeFlags,
934                                         bool                                 haveOtherWork,
935                                         gmx_wallcycle_t                      wcycle)
936 {
937     bool isPmeGpuDone = false;
938     bool isNbGpuDone  = false;
939
940
941     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
942
943     while (!isPmeGpuDone || !isNbGpuDone)
944     {
945         if (!isPmeGpuDone)
946         {
947             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
948             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
949         }
950
951         if (!isNbGpuDone)
952         {
953             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
954             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
955             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
956                                                     flags, eatLocal,
957                                                     haveOtherWork,
958                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
959                                                     fshift, completionType);
960             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
961             // To get the call count right, when the task finished we
962             // issue a start/stop.
963             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
964             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
965             if (isNbGpuDone)
966             {
967                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
968                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
969
970                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
971                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
972             }
973         }
974     }
975 }
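/* Generic shape of the polling scheme implemented above, as an illustrative,
 * API-free sketch: whichever task is still pending is polled, and once only
 * one task remains the poll turns into a blocking wait. The tryFinish
 * callables stand in for pme_gpu_try_finish_task()/nbnxn_gpu_try_finish_task().
 */
#if 0
template<typename TryFinishA, typename TryFinishB>
static void alternatingWait(TryFinishA tryFinishA, TryFinishB tryFinishB)
{
    bool aDone = false;
    bool bDone = false;
    while (!aDone || !bDone)
    {
        if (!aDone)
        {
            // Block only when the other task has already completed; poll otherwise.
            aDone = tryFinishA(/* blocking = */ bDone);
        }
        if (!bDone)
        {
            bDone = tryFinishB(/* blocking = */ aDone);
        }
    }
}
#endif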
976
977 /*! \brief
978  *  Launch the dynamic rolling pruning GPU task.
979  *
980  *  We currently alternate local/non-local list pruning in odd-even steps
981  *  (only pruning every second step without DD).
982  *
983  * \param[in]     cr               The communication record
984  * \param[in]     nbv              Nonbonded verlet structure
985  * \param[in]     inputrec         The input record
986  * \param[in]     step             The current MD step
987  */
988 static inline void launchGpuRollingPruning(const t_commrec          *cr,
989                                            const nonbonded_verlet_t *nbv,
990                                            const t_inputrec         *inputrec,
991                                            const int64_t             step)
992 {
993     /* We should not launch the rolling pruning kernel at a search
994      * step or just before search steps, since that's useless.
995      * Without domain decomposition we prune at even steps.
996      * With domain decomposition we alternate local and non-local
997      * pruning at even and odd steps.
998      */
999     int  numRollingParts     = nbv->listParams->numRollingParts;
1000     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1001     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1002     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1003     if (stepWithCurrentList > 0 &&
1004         stepWithCurrentList < inputrec->nstlist - 1 &&
1005         (stepIsEven || DOMAINDECOMP(cr)))
1006     {
1007         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1008                                           stepIsEven ? eintLocal : eintNonlocal,
1009                                           numRollingParts);
1010     }
1011 }
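/* Minimal sketch of the scheduling rule above (illustrative only, not used by
 * the code): given the step offset within the current outer list, decide
 * whether a pruning kernel is launched and whether it acts on the local or
 * the non-local list.
 */
#if 0
static bool decideRollingPrune(int stepWithCurrentList, int nstlist,
                               bool haveDomainDecomposition, bool *pruneLocal)
{
    const bool stepIsEven = ((stepWithCurrentList & 1) == 0);
    *pruneLocal = stepIsEven;
    // Never prune at a search step or the step right before one; without DD
    // only every second (even) step launches a pruning kernel.
    return (stepWithCurrentList > 0 &&
            stepWithCurrentList < nstlist - 1 &&
            (stepIsEven || haveDomainDecomposition));
}
#endif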
1012
1013 static void do_force_cutsVERLET(FILE *fplog,
1014                                 const t_commrec *cr,
1015                                 const gmx_multisim_t *ms,
1016                                 const t_inputrec *inputrec,
1017                                 gmx::Awh *awh,
1018                                 gmx_enfrot *enforcedRotation,
1019                                 int64_t step,
1020                                 t_nrnb *nrnb,
1021                                 gmx_wallcycle_t wcycle,
1022                                 const gmx_localtop_t *top,
1023                                 const gmx_groups_t * /* groups */,
1024                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1025                                 history_t *hist,
1026                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
1027                                 tensor vir_force,
1028                                 const t_mdatoms *mdatoms,
1029                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1030                                 real *lambda,
1031                                 t_graph *graph,
1032                                 t_forcerec *fr,
1033                                 gmx::PpForceWorkload *ppForceWorkload,
1034                                 interaction_const_t *ic,
1035                                 const gmx_vsite_t *vsite,
1036                                 rvec mu_tot,
1037                                 double t,
1038                                 gmx_edsam *ed,
1039                                 const int flags,
1040                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1041                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1042 {
1043     int                 cg1, i, j;
1044     double              mu[2*DIM];
1045     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1046     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1047     rvec                vzero, box_diag;
1048     float               cycles_pme, cycles_wait_gpu;
1049     nonbonded_verlet_t *nbv = fr->nbv;
1050
1051     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1052     bNS           = ((flags & GMX_FORCE_NS) != 0);
1053     bFillGrid     = (bNS && bStateChanged);
1054     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1055     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
1056     bUseGPU       = fr->nbv->bUseGPU;
1057     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1058
1059     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1060     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1061     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1062         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1063     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
1064         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
1065         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
1066         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
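    /* For example, a step that computes forces and energies but not the virial
     * ends up with pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
     * GMX_PME_CALC_ENER_VIR | GMX_PME_CALC_F, since the energy and virial
     * requests map to the same GMX_PME_CALC_ENER_VIR flag. */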
1067
1068     /* At a search step we need to start the first balancing region
1069      * somewhere early inside the step after communication during domain
1070      * decomposition (and not during the previous step as usual).
1071      */
1072     if (bNS &&
1073         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1074     {
1075         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1076     }
1077
1078     cycles_wait_gpu = 0;
1079
1080     const int start  = 0;
1081     const int homenr = mdatoms->homenr;
1082
1083     clear_mat(vir_force);
1084
1085     if (DOMAINDECOMP(cr))
1086     {
1087         cg1 = cr->dd->globalAtomGroupIndices.size();
1088     }
1089     else
1090     {
1091         cg1 = top->cgs.nr;
1092     }
1093     if (fr->n_tpi > 0)
1094     {
1095         cg1--;
1096     }
1097
1098     if (bStateChanged)
1099     {
1100         update_forcerec(fr, box);
1101
1102         if (inputrecNeedMutot(inputrec))
1103         {
1104             /* Calculate total (local) dipole moment in a temporary common array.
1105              * This makes it possible to sum them over nodes faster.
1106              */
1107             calc_mu(start, homenr,
1108                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1109                     mu, mu+DIM);
1110         }
1111     }
1112
1113     if (fr->ePBC != epbcNONE)
1114     {
1115         /* Compute shift vectors every step,
1116          * because of pressure coupling or box deformation!
1117          */
1118         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1119         {
1120             calc_shifts(box, fr->shift_vec);
1121         }
1122
1123         if (bCalcCGCM)
1124         {
1125             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1126             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1127         }
1128         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1129         {
1130             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1131         }
1132     }
1133
1134     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1135                                  fr->shift_vec, nbv->nbat);
1136
1137 #if GMX_MPI
1138     if (!thisRankHasDuty(cr, DUTY_PME))
1139     {
1140         /* Send particle coordinates to the pme nodes.
1141          * Since this is only implemented for domain decomposition
1142          * and domain decomposition does not use the graph,
1143          * we do not need to worry about shifting.
1144          */
1145         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1146                                  lambda[efptCOUL], lambda[efptVDW],
1147                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1148                                  step, wcycle);
1149     }
1150 #endif /* GMX_MPI */
1151
1152     if (useGpuPme)
1153     {
1154         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1155     }
1156
1157     /* do gridding for pair search */
1158     if (bNS)
1159     {
1160         if (graph && bStateChanged)
1161         {
1162             /* Calculate intramolecular shift vectors to make molecules whole */
1163             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1164         }
1165
1166         clear_rvec(vzero);
1167         box_diag[XX] = box[XX][XX];
1168         box_diag[YY] = box[YY][YY];
1169         box_diag[ZZ] = box[ZZ][ZZ];
1170
1171         wallcycle_start(wcycle, ewcNS);
1172         if (!DOMAINDECOMP(cr))
1173         {
1174             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1175             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1176                               0, vzero, box_diag,
1177                               nullptr, 0, mdatoms->homenr, -1,
1178                               fr->cginfo, x.unpaddedArrayRef(),
1179                               0, nullptr,
1180                               nbv->grp[eintLocal].kernel_type,
1181                               nbv->nbat);
1182             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1183         }
1184         else
1185         {
1186             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1187             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1188                                        fr->cginfo, x.unpaddedArrayRef(),
1189                                        nbv->grp[eintNonlocal].kernel_type,
1190                                        nbv->nbat);
1191             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1192         }
1193
1194         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1195
1196         wallcycle_stop(wcycle, ewcNS);
1197     }
1198
1199     /* initialize the GPU atom data and copy shift vector */
1200     if (bUseGPU)
1201     {
1202         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1203         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1204
1205         if (bNS)
1206         {
1207             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1208         }
1209
1210         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1211
1212         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1213
1214         if (bNS && fr->gpuBonded)
1215         {
1216             /* Now that all atoms are on the grid, we can assign bonded
1217              * interactions to the GPU, where the grid order is
1218              * needed. Also the xq, f and fshift device buffers have
1219              * been reallocated if needed, so the bonded code can
1220              * learn about them. */
1221             // TODO the xq, f, and fshift buffers are now shared
1222             // resources, so they should be maintained by a
1223             // higher-level object than the nb module.
1224             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1225                                                                   top->idef,
1226                                                                   nbnxn_gpu_get_xq(nbv->gpu_nbv),
1227                                                                   nbnxn_gpu_get_f(nbv->gpu_nbv),
1228                                                                   nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1229             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1230         }
1231
1232         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1233     }
1234
1235     /* do local pair search */
1236     if (bNS)
1237     {
1238         wallcycle_start_nocount(wcycle, ewcNS);
1239         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1240         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1241                             &top->excls,
1242                             nbv->listParams->rlistOuter,
1243                             nbv->min_ci_balanced,
1244                             &nbv->grp[eintLocal].nbl_lists,
1245                             eintLocal,
1246                             nbv->grp[eintLocal].kernel_type,
1247                             nrnb);
1248         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1249         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1250         {
1251             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1252         }
1253         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1254
1255         if (bUseGPU)
1256         {
1257             /* initialize local pair-list on the GPU */
1258             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1259                                     nbv->grp[eintLocal].nbl_lists.nblGpu[0],
1260                                     eintLocal);
1261         }
1262         wallcycle_stop(wcycle, ewcNS);
1263     }
1264     else
1265     {
1266         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1267                                         nbv->nbat, wcycle);
1268     }
1269
1270     if (bUseGPU)
1271     {
1272         if (DOMAINDECOMP(cr))
1273         {
1274             ddOpenBalanceRegionGpu(cr->dd);
1275         }
1276
1277         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1278
1279         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1280         nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
1281         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1282
1283         // Bonded work is not split into separate local and non-local kernels, so with DD
1284         // we can only launch the kernel after non-local coordinates have been received.
1285         if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1286         {
1287             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1288             fr->gpuBonded->launchKernels(fr, flags, box);
1289             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1290         }
1291
1292         /* launch local nonbonded work on GPU */
1293         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1294         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1295                      step, nrnb, wcycle);
1296         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1297         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1298     }
1299
1300     if (useGpuPme)
1301     {
1302         // In PME GPU and mixed mode we launch FFT / gather after the
1303         // X copy/transform to allow overlap as well as after the GPU NB
1304         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1305         // the nonbonded kernel.
1306         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1307     }
1308
1309     /* Communicate coordinates and sum dipole if necessary +
1310        do non-local pair search */
1311     if (DOMAINDECOMP(cr))
1312     {
1313         if (bNS)
1314         {
1315             wallcycle_start_nocount(wcycle, ewcNS);
1316             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1317
1318             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1319                                 &top->excls,
1320                                 nbv->listParams->rlistOuter,
1321                                 nbv->min_ci_balanced,
1322                                 &nbv->grp[eintNonlocal].nbl_lists,
1323                                 eintNonlocal,
1324                                 nbv->grp[eintNonlocal].kernel_type,
1325                                 nrnb);
1326             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1327             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1328             {
1329                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1330             }
1331             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1332
1333             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1334             {
1335                 /* initialize non-local pair-list on the GPU */
1336                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1337                                         nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
1338                                         eintNonlocal);
1339             }
1340             wallcycle_stop(wcycle, ewcNS);
1341         }
1342         else
1343         {
1344             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1345
1346             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1347                                             nbv->nbat, wcycle);
1348         }
1349
1350         if (bUseGPU)
1351         {
1352             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1353
1354             /* launch non-local nonbonded tasks on GPU */
1355             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1356             nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1357             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1358
1359             if (ppForceWorkload->haveGpuBondedWork)
1360             {
1361                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1362                 fr->gpuBonded->launchKernels(fr, flags, box);
1363                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1364             }
1365
1366             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1367             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1368                          step, nrnb, wcycle);
1369             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1370
1371             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1372         }
1373     }
1374
1375     if (bUseGPU)
1376     {
1377         /* launch D2H copy-back F */
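        /* With DD the non-local copy-back below is enqueued before the local one,
         * since the non-local forces are waited on first and then communicated
         * with dd_move_f(). */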
1378         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1379         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1380         if (DOMAINDECOMP(cr))
1381         {
1382             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1383                                      flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1384         }
1385         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1386                                  flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1387         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1388
1389         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1390         {
1391             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1392             fr->gpuBonded->launchEnergyTransfer();
1393             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1394         }
1395         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1396     }
1397
1398     if (bStateChanged && inputrecNeedMutot(inputrec))
1399     {
1400         if (PAR(cr))
1401         {
1402             gmx_sumd(2*DIM, mu, cr);
1403             ddReopenBalanceRegionCpu(cr->dd);
1404         }
1405
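        /* mu holds the A- and B-state dipole components back to back; unpack
         * them into fr->mu_tot (after the reduction over ranks when running
         * in parallel). */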
1406         for (i = 0; i < 2; i++)
1407         {
1408             for (j = 0; j < DIM; j++)
1409             {
1410                 fr->mu_tot[i][j] = mu[i*DIM + j];
1411             }
1412         }
1413     }
1414     if (fr->efep == efepNO)
1415     {
1416         copy_rvec(fr->mu_tot[0], mu_tot);
1417     }
1418     else
1419     {
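        /* With free-energy perturbation the total dipole is interpolated
         * between the A and B states using the Coulomb lambda. */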
1420         for (j = 0; j < DIM; j++)
1421         {
1422             mu_tot[j] =
1423                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1424                 lambda[efptCOUL]*fr->mu_tot[1][j];
1425         }
1426     }
1427
1428     /* Reset energies */
1429     reset_enerdata(enerd);
1430     clear_rvecs(SHIFTS, fr->fshift);
1431
1432     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1433     {
1434         wallcycle_start(wcycle, ewcPPDURINGPME);
1435         dd_force_flop_start(cr->dd, nrnb);
1436     }
1437
1438     if (inputrec->bRot)
1439     {
1440         wallcycle_start(wcycle, ewcROT);
1441         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1442         wallcycle_stop(wcycle, ewcROT);
1443     }
1444
1445     /* Temporary solution until all routines take PaddedRVecVector */
1446     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1447
1448     /* Start the force cycle counter.
1449      * Note that a different counter is used for dynamic load balancing.
1450      */
1451     wallcycle_start(wcycle, ewcFORCE);
1452
1453     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1454     if (bDoForces)
1455     {
1456         /* If we need to compute the virial, we might need a separate
1457          * force buffer for algorithms for which the virial is calculated
1458          * directly, such as PME.
1459          */
1460         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1461         {
1462             forceRef = *fr->forceBufferForDirectVirialContributions;
1463
1464             /* TODO: update comment
1465              * We only compute forces on local atoms. Note that vsites can
1466              * spread to non-local atoms, but that part of the buffer is
1467              * cleared separately in the vsite spreading code.
1468              */
1469             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1470         }
1471         /* Clear the short- and long-range forces */
1472         clear_rvecs_omp(fr->natoms_force_constr, f);
1473     }
1474
1475     /* forceWithVirial uses the local atom range only */
1476     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1477
1478     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1479     {
1480         clear_pull_forces(inputrec->pull_work);
1481     }
1482
1483     /* We calculate the non-bonded forces here when they are computed on the CPU.
1484      * We do this before calling do_force_lowlevel, because in that
1485      * function, the listed forces are calculated before PME, which
1486      * does communication.  With this order, non-bonded and listed
1487      * force calculation imbalance can be balanced out by the domain
1488      * decomposition load balancing.
1489      */
1490
1491     if (!bUseOrEmulGPU)
1492     {
1493         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1494                      step, nrnb, wcycle);
1495     }
1496
1497     if (fr->efep != efepNO)
1498     {
1499         /* Calculate the local and non-local free energy interactions here.
1500          * Happens here on the CPU both with and without GPU.
1501          */
1502         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1503         {
1504             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1505                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1506                              inputrec->fepvals, lambda,
1507                              enerd, flags, nrnb, wcycle);
1508         }
1509
1510         if (DOMAINDECOMP(cr) &&
1511             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1512         {
1513             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1514                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1515                              inputrec->fepvals, lambda,
1516                              enerd, flags, nrnb, wcycle);
1517         }
1518     }
1519
1520     if (!bUseOrEmulGPU)
1521     {
1522         int aloc;
1523
1524         if (DOMAINDECOMP(cr))
1525         {
1526             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1527                          step, nrnb, wcycle);
1528         }
1529
1530         if (!bUseOrEmulGPU)
1531         {
1532             aloc = eintLocal;
1533         }
1534         else
1535         {
1536             aloc = eintNonlocal;
1537         }
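        /* NOTE: with the enclosing !bUseOrEmulGPU check the else branch above
         * cannot be taken, so aloc is always eintLocal here. */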
1538
1539         /* Add all the non-bonded forces to the normal force array.
1540          * This can be split into a local and a non-local part when overlapping
1541          * communication with computation under domain decomposition.
1542          */
1543         wallcycle_stop(wcycle, ewcFORCE);
1544
1545         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1546
1547         wallcycle_start_nocount(wcycle, ewcFORCE);
1548
1549         /* if there are multiple fshift output buffers reduce them */
1550         if ((flags & GMX_FORCE_VIRIAL) &&
1551             nbv->grp[aloc].nbl_lists.nnbl > 1)
1552         {
1553             /* This is not in a subcounter because it takes a
1554                negligible and constant-sized amount of time */
1555             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1556                                                      fr->fshift);
1557         }
1558     }
1559
1560     /* update QMMMrec, if necessary */
1561     if (fr->bQMMM)
1562     {
1563         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1564     }
1565
1566     /* Compute the bonded and non-bonded energies and optionally forces */
1567     do_force_lowlevel(fr, inputrec, &(top->idef),
1568                       cr, ms, nrnb, wcycle, mdatoms,
1569                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1570                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1571                       flags, &cycles_pme);
1572
1573     wallcycle_stop(wcycle, ewcFORCE);
1574
1575     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1576                          step, t, wcycle,
1577                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1578                          flags, &forceWithVirial, enerd,
1579                          ed, bNS);
1580
1581     if (bUseOrEmulGPU)
1582     {
1583         /* wait for non-local forces (or calculate in emulation mode) */
1584         if (DOMAINDECOMP(cr))
1585         {
1586             if (bUseGPU)
1587             {
1588                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1589                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1590                                            flags, eatNonlocal,
1591                                            ppForceWorkload->haveGpuBondedWork,
1592                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1593                                            fr->fshift);
1594                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1595             }
1596             else
1597             {
1598                 wallcycle_start_nocount(wcycle, ewcFORCE);
1599                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1600                              step, nrnb, wcycle);
1601                 wallcycle_stop(wcycle, ewcFORCE);
1602             }
1603
1604             /* skip the reduction if there was no non-local work to do */
1605             if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
1606             {
1607                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1608                                                nbv->nbat, f, wcycle);
1609             }
1610         }
1611     }
1612
1613     if (DOMAINDECOMP(cr))
1614     {
1615         /* We are done with the CPU compute.
1616          * We will now communicate the non-local forces.
1617          * If we use a GPU this will overlap with GPU work, so in that case
1618          * we do not close the DD force balancing region here.
1619          */
1620         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1621         {
1622             ddCloseBalanceRegionCpu(cr->dd);
1623         }
1624         if (bDoForces)
1625         {
1626             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1627         }
1628     }
1629
1630     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1631     // an alternating wait/reduction scheme.
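    // The helper below polls the PME and nonbonded GPU tasks and reduces the
    // forces of whichever completes first, instead of waiting in a fixed order.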
1632     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1633     if (alternateGpuWait)
1634     {
1635         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1636     }
1637
1638     if (!alternateGpuWait && useGpuPme)
1639     {
1640         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1641     }
1642
1643     /* Wait for local GPU NB outputs on the non-alternating wait path */
1644     if (!alternateGpuWait && bUseGPU)
1645     {
1646         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1647          * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1648          * but even with a step of 0.1 ms the difference is less than 1%
1649          * of the step time.
1650          */
1651         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1652
1653         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1654         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1655                                    flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1656                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1657                                    fr->fshift);
1658         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1659
1660         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1661         {
1662             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1663             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1664             {
1665                 /* We measured only a few cycles, so it could be that the kernel
1666                  * and transfer finished earlier and there was no actual
1667                  * wait time, only API call overhead.
1668                  * In that case we do not flag the balancing region as having
1669                  * waited for the GPU.
1670                  */
1671                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1672             }
1673             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1674         }
1675     }
1676
1677     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1678     {
1679         // NOTE: emulation kernel is not included in the balancing region,
1680         // but emulation mode does not target performance anyway
1681         wallcycle_start_nocount(wcycle, ewcFORCE);
1682         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1683                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1684                      step, nrnb, wcycle);
1685         wallcycle_stop(wcycle, ewcFORCE);
1686     }
1687
1688     if (useGpuPme)
1689     {
1690         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1691     }
1692
1693     if (bUseGPU)
1694     {
1695         /* now clear the GPU outputs while we finish the step on the CPU */
1696         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1697         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1698         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1699
1700         /* Is dynamic pair-list pruning activated? */
1701         if (nbv->listParams->useDynamicPruning)
1702         {
1703             launchGpuRollingPruning(cr, nbv, inputrec, step);
1704         }
1705         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1706         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1707     }
1708
1709     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1710     {
1711         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1712         // in principle this should be included in the DD balancing region,
1713         // but generally it is infrequent so we'll omit it for the sake of
1714         // simpler code
1715         fr->gpuBonded->accumulateEnergyTerms(enerd);
1716         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1717
1718         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1719         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1720         fr->gpuBonded->clearEnergies();
1721         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1722         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1723     }
1724
1725     /* Do the nonbonded GPU (or emulation) force buffer reduction
1726      * on the non-alternating path. */
1727     if (bUseOrEmulGPU && !alternateGpuWait)
1728     {
1729         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1730                                        nbv->nbat, f, wcycle);
1731     }
1732     if (DOMAINDECOMP(cr))
1733     {
1734         dd_force_flop_stop(cr->dd, nrnb);
1735     }
1736
1737     if (bDoForces)
1738     {
1739         /* If we have NoVirSum forces, but we do not calculate the virial,
1740          * we sum fr->f_novirsum=f later.
1741          */
1742         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1743         {
1744             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1745                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1746         }
1747
1748         if (flags & GMX_FORCE_VIRIAL)
1749         {
1750             /* Calculation of the virial must be done after vsites! */
1751             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1752                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1753         }
1754     }
1755
1756     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1757     {
1758         /* In case of node-splitting, the PP nodes receive the long-range
1759          * forces, virial and energy from the PME nodes here.
1760          */
1761         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1762     }
1763
1764     if (bDoForces)
1765     {
1766         post_process_forces(cr, step, nrnb, wcycle,
1767                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1768                             vir_force, mdatoms, graph, fr, vsite,
1769                             flags);
1770     }
1771
1772     if (flags & GMX_FORCE_ENERGY)
1773     {
1774         /* Sum the potential energy terms from group contributions */
1775         sum_epot(&(enerd->grpp), enerd->term);
1776
1777         if (!EI_TPI(inputrec->eI))
1778         {
1779             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1780         }
1781     }
1782 }
1783
1784 static void do_force_cutsGROUP(FILE *fplog,
1785                                const t_commrec *cr,
1786                                const gmx_multisim_t *ms,
1787                                const t_inputrec *inputrec,
1788                                gmx::Awh *awh,
1789                                gmx_enfrot *enforcedRotation,
1790                                int64_t step,
1791                                t_nrnb *nrnb,
1792                                gmx_wallcycle_t wcycle,
1793                                gmx_localtop_t *top,
1794                                const gmx_groups_t *groups,
1795                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1796                                history_t *hist,
1797                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1798                                tensor vir_force,
1799                                const t_mdatoms *mdatoms,
1800                                gmx_enerdata_t *enerd,
1801                                t_fcdata *fcd,
1802                                real *lambda,
1803                                t_graph *graph,
1804                                t_forcerec *fr,
1805                                const gmx_vsite_t *vsite,
1806                                rvec mu_tot,
1807                                double t,
1808                                gmx_edsam *ed,
1809                                int flags,
1810                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1811                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1812 {
1813     int        cg0, cg1, i, j;
1814     double     mu[2*DIM];
1815     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1816     gmx_bool   bDoForces;
1817     float      cycles_pme;
1818
1819     const int  start  = 0;
1820     const int  homenr = mdatoms->homenr;
1821
1822     clear_mat(vir_force);
1823
1824     cg0 = 0;
1825     if (DOMAINDECOMP(cr))
1826     {
1827         cg1 = cr->dd->globalAtomGroupIndices.size();
1828     }
1829     else
1830     {
1831         cg1 = top->cgs.nr;
1832     }
1833     if (fr->n_tpi > 0)
1834     {
1835         cg1--;
1836     }
1837
1838     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1839     bNS            = ((flags & GMX_FORCE_NS) != 0);
1840     /* Should we fill the charge-group grid during this neighbor search? */
1841     bFillGrid      = (bNS && bStateChanged);
1842     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1843     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1844
1845     if (bStateChanged)
1846     {
1847         update_forcerec(fr, box);
1848
1849         if (inputrecNeedMutot(inputrec))
1850         {
1851             /* Calculate total (local) dipole moment in a temporary common array.
1852              * This makes it possible to sum them over nodes faster.
1853              */
1854             calc_mu(start, homenr,
1855                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1856                     mu, mu+DIM);
1857         }
1858     }
1859
1860     if (fr->ePBC != epbcNONE)
1861     {
1862         /* Compute shift vectors every step,
1863          * because of pressure coupling or box deformation!
1864          */
1865         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1866         {
1867             calc_shifts(box, fr->shift_vec);
1868         }
1869
1870         if (bCalcCGCM)
1871         {
1872             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1873                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1874             inc_nrnb(nrnb, eNR_CGCM, homenr);
1875             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1876         }
1877         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1878         {
1879             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1880         }
1881     }
1882     else if (bCalcCGCM)
1883     {
1884         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1885         inc_nrnb(nrnb, eNR_CGCM, homenr);
1886     }
1887
1888     if (bCalcCGCM && gmx_debug_at)
1889     {
1890         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1891     }
1892
1893 #if GMX_MPI
1894     if (!thisRankHasDuty(cr, DUTY_PME))
1895     {
1896         /* Send particle coordinates to the pme nodes.
1897          * Since this is only implemented for domain decomposition
1898          * and domain decomposition does not use the graph,
1899          * we do not need to worry about shifting.
1900          */
1901         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1902                                  lambda[efptCOUL], lambda[efptVDW],
1903                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1904                                  step, wcycle);
1905     }
1906 #endif /* GMX_MPI */
1907
1908     /* Communicate coordinates and sum dipole if necessary */
1909     if (DOMAINDECOMP(cr))
1910     {
1911         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1912
1913         /* No GPU support, no move_x overlap, so reopen the balance region here */
1914         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1915         {
1916             ddReopenBalanceRegionCpu(cr->dd);
1917         }
1918     }
1919
1920     if (inputrecNeedMutot(inputrec))
1921     {
1922         if (bStateChanged)
1923         {
1924             if (PAR(cr))
1925             {
1926                 gmx_sumd(2*DIM, mu, cr);
1927                 ddReopenBalanceRegionCpu(cr->dd);
1928             }
1929             for (i = 0; i < 2; i++)
1930             {
1931                 for (j = 0; j < DIM; j++)
1932                 {
1933                     fr->mu_tot[i][j] = mu[i*DIM + j];
1934                 }
1935             }
1936         }
1937         if (fr->efep == efepNO)
1938         {
1939             copy_rvec(fr->mu_tot[0], mu_tot);
1940         }
1941         else
1942         {
1943             for (j = 0; j < DIM; j++)
1944             {
1945                 mu_tot[j] =
1946                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1947             }
1948         }
1949     }
1950
1951     /* Reset energies */
1952     reset_enerdata(enerd);
1953     clear_rvecs(SHIFTS, fr->fshift);
1954
1955     if (bNS)
1956     {
1957         wallcycle_start(wcycle, ewcNS);
1958
1959         if (graph && bStateChanged)
1960         {
1961             /* Calculate intramolecular shift vectors to make molecules whole */
1962             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1963         }
1964
1965         /* Do the actual neighbour searching */
1966         ns(fplog, fr, box,
1967            groups, top, mdatoms,
1968            cr, nrnb, bFillGrid);
1969
1970         wallcycle_stop(wcycle, ewcNS);
1971     }
1972
1973     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1974     {
1975         wallcycle_start(wcycle, ewcPPDURINGPME);
1976         dd_force_flop_start(cr->dd, nrnb);
1977     }
1978
1979     if (inputrec->bRot)
1980     {
1981         wallcycle_start(wcycle, ewcROT);
1982         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1983         wallcycle_stop(wcycle, ewcROT);
1984     }
1985
1986     /* Temporary solution until all routines take PaddedRVecVector */
1987     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1988
1989     /* Start the force cycle counter.
1990      * Note that a different counter is used for dynamic load balancing.
1991      */
1992     wallcycle_start(wcycle, ewcFORCE);
1993
1994     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1995     if (bDoForces)
1996     {
1997         /* If we need to compute the virial, we might need a separate
1998          * force buffer for algorithms for which the virial is calculated
1999          * separately, such as PME.
2000          */
2001         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
2002         {
2003             forceRef = *fr->forceBufferForDirectVirialContributions;
2004
2005             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
2006         }
2007
2008         /* Clear the short- and long-range forces */
2009         clear_rvecs(fr->natoms_force_constr, f);
2010     }
2011
2012     /* forceWithVirial might need the full force atom range */
2013     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
2014
2015     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
2016     {
2017         clear_pull_forces(inputrec->pull_work);
2018     }
2019
2020     /* update QMMMrec, if necessary */
2021     if (fr->bQMMM)
2022     {
2023         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
2024     }
2025
2026     /* Compute the bonded and non-bonded energies and optionally forces */
2027     do_force_lowlevel(fr, inputrec, &(top->idef),
2028                       cr, ms, nrnb, wcycle, mdatoms,
2029                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
2030                       box, inputrec->fepvals, lambda,
2031                       graph, &(top->excls), fr->mu_tot,
2032                       flags,
2033                       &cycles_pme);
2034
2035     wallcycle_stop(wcycle, ewcFORCE);
2036
2037     if (DOMAINDECOMP(cr))
2038     {
2039         dd_force_flop_stop(cr->dd, nrnb);
2040
2041         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2042         {
2043             ddCloseBalanceRegionCpu(cr->dd);
2044         }
2045     }
2046
2047     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2048                          step, t, wcycle,
2049                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
2050                          flags, &forceWithVirial, enerd,
2051                          ed, bNS);
2052
2053     if (bDoForces)
2054     {
2055         /* Communicate the forces */
2056         if (DOMAINDECOMP(cr))
2057         {
2058             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
2059             /* Do we need to communicate the separate force array
2060              * for terms that do not contribute to the single sum virial?
2061              * Position restraints and electric fields do not introduce
2062              * inter-cg forces, only full electrostatics methods do.
2063              * When we do not calculate the virial, fr->f_novirsum = f,
2064              * so we have already communicated these forces.
2065              */
2066             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2067                 (flags & GMX_FORCE_VIRIAL))
2068             {
2069                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2070             }
2071         }
2072
2073         /* If we have NoVirSum forces, but we do not calculate the virial,
2074          * we sum fr->f_novirsum=f later.
2075          */
2076         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2077         {
2078             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
2079                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2080         }
2081
2082         if (flags & GMX_FORCE_VIRIAL)
2083         {
2084             /* Calculation of the virial must be done after vsites! */
2085             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
2086                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2087         }
2088     }
2089
2090     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2091     {
2092         /* In case of node-splitting, the PP nodes receive the long-range
2093          * forces, virial and energy from the PME nodes here.
2094          */
2095         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2096     }
2097
2098     if (bDoForces)
2099     {
2100         post_process_forces(cr, step, nrnb, wcycle,
2101                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2102                             vir_force, mdatoms, graph, fr, vsite,
2103                             flags);
2104     }
2105
2106     if (flags & GMX_FORCE_ENERGY)
2107     {
2108         /* Sum the potential energy terms from group contributions */
2109         sum_epot(&(enerd->grpp), enerd->term);
2110
2111         if (!EI_TPI(inputrec->eI))
2112         {
2113             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2114         }
2115     }
2116
2117 }
2118
2119 void do_force(FILE                                     *fplog,
2120               const t_commrec                          *cr,
2121               const gmx_multisim_t                     *ms,
2122               const t_inputrec                         *inputrec,
2123               gmx::Awh                                 *awh,
2124               gmx_enfrot                               *enforcedRotation,
2125               int64_t                                   step,
2126               t_nrnb                                   *nrnb,
2127               gmx_wallcycle_t                           wcycle,
2128               gmx_localtop_t                           *top,
2129               const gmx_groups_t                       *groups,
2130               matrix                                    box,
2131               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
2132               history_t                                *hist,
2133               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
2134               tensor                                    vir_force,
2135               const t_mdatoms                          *mdatoms,
2136               gmx_enerdata_t                           *enerd,
2137               t_fcdata                                 *fcd,
2138               gmx::ArrayRef<real>                       lambda,
2139               t_graph                                  *graph,
2140               t_forcerec                               *fr,
2141               gmx::PpForceWorkload                     *ppForceWorkload,
2142               const gmx_vsite_t                        *vsite,
2143               rvec                                      mu_tot,
2144               double                                    t,
2145               gmx_edsam                                *ed,
2146               int                                       flags,
2147               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2148               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2149 {
2150     /* modify force flag if not doing nonbonded */
2151     if (!fr->bNonbonded)
2152     {
2153         flags &= ~GMX_FORCE_NONBONDED;
2154     }
2155
2156     switch (inputrec->cutoff_scheme)
2157     {
2158         case ecutsVERLET:
2159             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2160                                 awh, enforcedRotation, step, nrnb, wcycle,
2161                                 top,
2162                                 groups,
2163                                 box, x, hist,
2164                                 force, vir_force,
2165                                 mdatoms,
2166                                 enerd, fcd,
2167                                 lambda.data(), graph,
2168                                 fr,
2169                                 ppForceWorkload,
2170                                 fr->ic,
2171                                 vsite, mu_tot,
2172                                 t, ed,
2173                                 flags,
2174                                 ddOpenBalanceRegion,
2175                                 ddCloseBalanceRegion);
2176             break;
2177         case ecutsGROUP:
2178             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2179                                awh, enforcedRotation, step, nrnb, wcycle,
2180                                top,
2181                                groups,
2182                                box, x, hist,
2183                                force, vir_force,
2184                                mdatoms,
2185                                enerd, fcd,
2186                                lambda.data(), graph,
2187                                fr, vsite, mu_tot,
2188                                t, ed,
2189                                flags,
2190                                ddOpenBalanceRegion,
2191                                ddCloseBalanceRegion);
2192             break;
2193         default:
2194             gmx_incons("Invalid cut-off scheme passed!");
2195     }
2196
2197     /* In case we don't have constraints and are using GPUs, the next balancing
2198      * region starts here.
2199      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2200      * virial calculation and COM pulling, is thus not included in
2201      * the balance timing, which is OK as most of that work involves communication.
2202      */
2203     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2204     {
2205         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2206     }
2207 }
2208
2209
2210 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2211                         const t_inputrec *ir, const t_mdatoms *md,
2212                         t_state *state)
2213 {
2214     int             i, m, start, end;
2215     int64_t         step;
2216     real            dt = ir->delta_t;
2217     real            dvdl_dum;
2218     rvec           *savex;
2219
2220     /* We need to allocate one element extra, since we might use
2221      * (unaligned) 4-wide SIMD loads to access rvec entries.
2222      */
2223     snew(savex, state->natoms + 1);
2224
2225     start = 0;
2226     end   = md->homenr;
2227
2228     if (debug)
2229     {
2230         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2231                 start, md->homenr, end);
2232     }
2233     /* Do a first constraint step to reset the particles... */
2234     step = ir->init_step;
2235     if (fplog)
2236     {
2237         char buf[STEPSTRSIZE];
2238         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2239                 gmx_step_str(step, buf));
2240     }
2241     dvdl_dum = 0;
2242
2243     /* constrain the current position */
2244     constr->apply(TRUE, FALSE,
2245                   step, 0, 1.0,
2246                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
2247                   state->box,
2248                   state->lambda[efptBONDED], &dvdl_dum,
2249                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2250     if (EI_VV(ir->eI))
2251     {
2252         /* constrain the initial velocity, and save it */
2253         /* this may also be useful if we need the ekin from the half step for velocity Verlet */
2254         constr->apply(TRUE, FALSE,
2255                       step, 0, 1.0,
2256                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2257                       state->box,
2258                       state->lambda[efptBONDED], &dvdl_dum,
2259                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2260     }
2261     /* constrain the initial velocities at t-dt/2 */
2262     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2263     {
2264         auto x = makeArrayRef(state->x).subArray(start, end);
2265         auto v = makeArrayRef(state->v).subArray(start, end);
2266         for (i = start; (i < end); i++)
2267         {
2268             for (m = 0; (m < DIM); m++)
2269             {
2270                 /* Reverse the velocity */
2271                 v[i][m] = -v[i][m];
2272                 /* Store the position at t-dt in buf */
2273                 savex[i][m] = x[i][m] + dt*v[i][m];
2274             }
2275         }
2276         /* Shake the positions at t=-dt with the positions at t=0
2277          * as reference coordinates.
2278          */
2279         if (fplog)
2280         {
2281             char buf[STEPSTRSIZE];
2282             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2283                     gmx_step_str(step, buf));
2284         }
2285         dvdl_dum = 0;
2286         constr->apply(TRUE, FALSE,
2287                       step, -1, 1.0,
2288                       state->x.rvec_array(), savex, nullptr,
2289                       state->box,
2290                       state->lambda[efptBONDED], &dvdl_dum,
2291                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2292
2293         for (i = start; i < end; i++)
2294         {
2295             for (m = 0; m < DIM; m++)
2296             {
2297                 /* Re-reverse the velocities */
2298                 v[i][m] = -v[i][m];
2299             }
2300         }
2301     }
2302     sfree(savex);
2303 }
2304
2305
2306 static void
2307 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2308                 double *enerout, double *virout)
2309 {
2310     double enersum, virsum;
2311     double invscale, invscale2, invscale3;
2312     double r, ea, eb, ec, pa, pb, pc, pd;
2313     double y0, f, g, h;
2314     int    ri, offset;
2315     double tabfactor;
2316
2317     invscale  = 1.0/scale;
2318     invscale2 = invscale*invscale;
2319     invscale3 = invscale*invscale2;
2320
2321     /* Following summation derived from cubic spline definition,
2322      * Numerical Recipes in C, second edition, pp. 113-116.  Exact for
2323      * the cubic spline.  We first calculate the negative of the
2324      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2325      * add the more standard, abrupt cutoff correction to that result,
2326      * yielding the long-range correction for a switched function.  We
2327      * perform both the pressure and energy loops at the same time for
2328      * simplicity, as the computational cost is low. */
2329
2330     if (offstart == 0)
2331     {
2332         /* Since the dispersion table has been scaled down by a factor
2333          * of 6.0 and the repulsion by a factor of 12.0 to compensate for the
2334          * c6/c12 parameters inside nbfp[] being scaled up (to save
2335          * flops in kernels), we need to correct for this.
2336          */
2337         tabfactor = 6.0;
2338     }
2339     else
2340     {
2341         tabfactor = 12.0;
2342     }
2343
2344     enersum = 0.0;
2345     virsum  = 0.0;
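    /* Within a table bin, with eps in [0,1] the fractional position and
     * r(eps) = (ri + eps)/scale, we have r^2 dr = (ea*eps^2 + eb*eps + ec) deps
     * and r^3 = pa*eps^3 + pb*eps^2 + pc*eps + pd, while the spline is
     * V(eps) = y0 + f*eps + g*eps^2 + h*eps^3. The sums below are therefore the
     * exact per-bin integrals of V(r) r^2 dr (energy) and r^3 dV/dr dr (virial);
     * the 4*pi shell factor and the table scaling are applied at the end. */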
2346     for (ri = rstart; ri < rend; ++ri)
2347     {
2348         r  = ri*invscale;
2349         ea = invscale3;
2350         eb = 2.0*invscale2*r;
2351         ec = invscale*r*r;
2352
2353         pa = invscale3;
2354         pb = 3.0*invscale2*r;
2355         pc = 3.0*invscale*r*r;
2356         pd = r*r*r;
2357
2358         /* this stride of 8 comes from the packing of the vdwtab array - perhaps
2359            it should be a named constant */
2360
2361         offset = 8*ri + offstart;
2362         y0     = vdwtab[offset];
2363         f      = vdwtab[offset+1];
2364         g      = vdwtab[offset+2];
2365         h      = vdwtab[offset+3];
2366
2367         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2368         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2369     }
2370     *enerout = 4.0*M_PI*enersum*tabfactor;
2371     *virout  = 4.0*M_PI*virsum*tabfactor;
2372 }
2373
2374 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2375 {
2376     double   eners[2], virs[2], enersum, virsum;
2377     double   r0, rc3, rc9;
2378     int      ri0, ri1, i;
2379     real     scale, *vdwtab;
2380
2381     fr->enershiftsix    = 0;
2382     fr->enershifttwelve = 0;
2383     fr->enerdiffsix     = 0;
2384     fr->enerdifftwelve  = 0;
2385     fr->virdiffsix      = 0;
2386     fr->virdifftwelve   = 0;
2387
2388     const interaction_const_t *ic = fr->ic;
2389
2390     if (eDispCorr != edispcNO)
2391     {
2392         for (i = 0; i < 2; i++)
2393         {
2394             eners[i] = 0;
2395             virs[i]  = 0;
2396         }
2397         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2398             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2399             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2400             (ic->vdwtype == evdwSHIFT) ||
2401             (ic->vdwtype == evdwSWITCH))
2402         {
2403             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2404                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2405                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2406             {
2407                 gmx_fatal(FARGS,
2408                           "With dispersion correction rvdw-switch can not be zero "
2409                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2410             }
2411
2412             /* TODO This code depends on the logic in tables.c that
2413                constructs the table layout, which should be made
2414                explicit in future cleanup. */
2415             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2416                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2417             scale  = fr->dispersionCorrectionTable->scale;
2418             vdwtab = fr->dispersionCorrectionTable->data;
2419
2420             /* Round the cut-offs to exact table values for precision */
2421             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2422             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2423
2424             /* The code below has some support for handling force-switching, i.e.
2425              * when the force (instead of potential) is switched over a limited
2426              * region. This leads to a constant shift in the potential inside the
2427              * switching region, which we can handle by adding a constant energy
2428              * term in the force-switch case just like when we do potential-shift.
2429              *
2430              * For now this is not enabled, but to keep the functionality in the
2431              * code we check separately for switch and shift. When we do force-switch
2432              * the shifting point is rvdw_switch, while it is the cutoff when we
2433              * have a classical potential-shift.
2434              *
2435              * For a pure potential-shift the potential has a constant shift
2436              * all the way out to the cutoff, and that is it. For other forms
2437              * we need to calculate the constant shift up to the point where we
2438              * start modifying the potential.
2439              */
2440             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2441
2442             r0   = ri0/scale;
2443             rc3  = r0*r0*r0;
2444             rc9  = rc3*rc3*rc3;
2445
2446             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2447                 (ic->vdwtype == evdwSHIFT))
2448             {
2449                 /* Determine the constant energy shift below rvdw_switch.
2450                  * The table carries a scale factor, since it has been scaled down to compensate
2451                  * for the c6/c12 parameters being scaled up (by the derivative factors) to save flops in the analytical kernels.
2452                  */
2453                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2454                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2455             }
2456             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2457             {
2458                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2459                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2460             }
2461
2462             /* Add the constant part from 0 to rvdw_switch.
2463              * This integration from 0 to rvdw_switch overcounts the number
2464              * of interactions by 1, as it also counts the self interaction.
2465              * We will correct for this later.
2466              */
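            /* The volume integral of a constant shift is (4*pi/3)*r0^3*shift,
             * which gives the two terms below. */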
2467             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2468             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2469
2470             /* Calculate the contribution in the range [r0,r1] where we
2471              * modify the potential. For a pure potential-shift modifier we will
2472              * have ri0==ri1, and there will not be any contribution here.
2473              */
2474             for (i = 0; i < 2; i++)
2475             {
2476                 enersum = 0;
2477                 virsum  = 0;
2478                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2479                 eners[i] -= enersum;
2480                 virs[i]  -= virsum;
2481             }
2482
2483             /* Alright: Above we compensated by REMOVING the parts outside r0
2484              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2485              *
2486              * Regardless of whether r0 is the point where we start switching,
2487              * or the cutoff where we calculated the constant shift, we include
2488              * all the parts we are missing out to infinity from r0 by
2489              * calculating the analytical dispersion correction.
2490              */
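            /* For g(r) = 1 the tail integrals are
             * int_r0^inf 4*pi*r^2*(-1/r^6) dr  = -4*pi/(3*r0^3) and
             * int_r0^inf 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9);
             * the corresponding virial integrals of 4*pi*r^3*dU/dr give
             * 8*pi/r0^3 and -16*pi/(3*r0^9), matching the terms below. */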
2491             eners[0] += -4.0*M_PI/(3.0*rc3);
2492             eners[1] +=  4.0*M_PI/(9.0*rc9);
2493             virs[0]  +=  8.0*M_PI/rc3;
2494             virs[1]  += -16.0*M_PI/(3.0*rc9);
2495         }
2496         else if (ic->vdwtype == evdwCUT ||
2497                  EVDW_PME(ic->vdwtype) ||
2498                  ic->vdwtype == evdwUSER)
2499         {
2500             if (ic->vdwtype == evdwUSER && fplog)
2501             {
2502                 fprintf(fplog,
2503                         "WARNING: using dispersion correction with user tables\n");
2504             }
2505
2506             /* Note that with LJ-PME, the dispersion correction is multiplied
2507              * by the difference between the actual C6 and the C6 value
2508              * prescribed by the combination rule.
2509              * This means the normal energy and virial difference formulas
2510              * can be used here.
2511              */
2512
2513             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2514             rc9  = rc3*rc3*rc3;
2515             /* Contribution beyond the cut-off */
2516             eners[0] += -4.0*M_PI/(3.0*rc3);
2517             eners[1] +=  4.0*M_PI/(9.0*rc9);
2518             if (ic->vdw_modifier == eintmodPOTSHIFT)
2519             {
2520                 /* Contribution within the cut-off */
2521                 eners[0] += -4.0*M_PI/(3.0*rc3);
2522                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2523             }
2524             /* Contribution beyond the cut-off */
2525             virs[0]  +=  8.0*M_PI/rc3;
2526             virs[1]  += -16.0*M_PI/(3.0*rc9);
2527         }
2528         else
2529         {
2530             gmx_fatal(FARGS,
2531                       "Dispersion correction is not implemented for vdw-type = %s",
2532                       evdw_names[ic->vdwtype]);
2533         }
2534
2535         /* When we deprecate the group kernels the code below can go too */
2536         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2537         {
2538             /* Calculate self-interaction coefficient (assuming that
2539              * the reciprocal-space contribution is constant in the
2540              * region that contributes to the self-interaction).
2541              */
2542             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2543
2544             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2545             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2546         }
2547
2548         fr->enerdiffsix    = eners[0];
2549         fr->enerdifftwelve = eners[1];
2550         /* The 0.5 is due to the Gromacs definition of the virial */
2551         fr->virdiffsix     = 0.5*virs[0];
2552         fr->virdifftwelve  = 0.5*virs[1];
2553     }
2554 }
2555
2556 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2557                    const matrix box, real lambda, tensor pres, tensor virial,
2558                    real *prescorr, real *enercorr, real *dvdlcorr)
2559 {
2560     gmx_bool bCorrAll, bCorrPres;
2561     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2562     int      m;
2563
2564     *prescorr = 0;
2565     *enercorr = 0;
2566     *dvdlcorr = 0;
2567
2568     clear_mat(virial);
2569     clear_mat(pres);
2570
2571     if (ir->eDispCorr != edispcNO)
2572     {
2573         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2574                      ir->eDispCorr == edispcAllEnerPres);
2575         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2576                      ir->eDispCorr == edispcAllEnerPres);
2577
2578         invvol = 1/det(box);
2579         if (fr->n_tpi)
2580         {
2581             /* Only correct for the interactions with the inserted molecule */
2582             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2583             ninter = fr->n_tpi;
2584         }
2585         else
2586         {
2587             dens   = fr->numAtomsForDispersionCorrection*invvol;
2588             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2589         }
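        /* The correction scales with the number of interacting pairs:
         * ninter*dens equals N^2/(2V) for a homogeneous system, or
         * n_tpi*(N - n_tpi)/V when only the interactions with the inserted
         * (TPI) molecule are corrected. */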
2590
2591         if (ir->efep == efepNO)
2592         {
2593             avcsix    = fr->avcsix[0];
2594             avctwelve = fr->avctwelve[0];
2595         }
2596         else
2597         {
2598             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2599             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2600         }
2601
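        /* Dispersion (C6) part of the correction:
         * E_corr = <C6>*ninter*(dens*enerdiffsix - enershiftsix);
         * the repulsion (C12) part below is analogous when all terms are corrected. */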
2602         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2603         *enercorr += avcsix*enerdiff;
2604         dvdlambda  = 0.0;
2605         if (ir->efep != efepNO)
2606         {
2607             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2608         }
2609         if (bCorrAll)
2610         {
2611             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2612             *enercorr += avctwelve*enerdiff;
2613             if (fr->efep != efepNO)
2614             {
2615                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2616             }
2617         }
2618
2619         if (bCorrPres)
2620         {
2621             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2622             if (ir->eDispCorr == edispcAllEnerPres)
2623             {
2624                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2625             }
2626             /* The factor 2 is because of the Gromacs virial definition */
2627             spres = -2.0*invvol*svir*PRESFAC;
2628
2629             for (m = 0; m < DIM; m++)
2630             {
2631                 virial[m][m] += svir;
2632                 pres[m][m]   += spres;
2633             }
2634             *prescorr += spres;
2635         }
2636
2637         /* Can't currently control when this prints; for now, just print when debugging */
2638         if (debug)
2639         {
2640             if (bCorrAll)
2641             {
2642                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2643                         avcsix, avctwelve);
2644             }
2645             if (bCorrPres)
2646             {
2647                 fprintf(debug,
2648                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2649                         *enercorr, spres, svir);
2650             }
2651             else
2652             {
2653                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2654             }
2655         }
2656
2657         if (fr->efep != efepNO)
2658         {
2659             *dvdlcorr += dvdlambda;
2660         }
2661     }
2662 }
2663
2664 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2665                             const gmx_mtop_t *mtop, rvec x[],
2666                             gmx_bool bFirst)
2667 {
2668     t_graph        *graph;
2669     int             as, mol;
2670
2671     if (bFirst && fplog)
2672     {
2673         fprintf(fplog, "Removing pbc first time\n");
2674     }
2675
2676     snew(graph, 1);
2677     as = 0;
2678     for (const gmx_molblock_t &molb : mtop->molblock)
2679     {
2680         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2681         if (moltype.atoms.nr == 1 ||
2682             (!bFirst && moltype.cgs.nr == 1))
2683         {
2684             /* Just one atom or charge group in the molecule, no PBC required */
2685             as += molb.nmol*moltype.atoms.nr;
2686         }
2687         else
2688         {
2689             mk_graph_moltype(moltype, graph);
2690
2691             for (mol = 0; mol < molb.nmol; mol++)
2692             {
2693                 mk_mshift(fplog, graph, ePBC, box, x+as);
2694
2695                 shift_self(graph, box, x+as);
2696                 /* The molecule is whole now.
2697                  * We don't need the second mk_mshift call as in do_pbc_first,
2698                  * since we no longer need this graph.
2699                  */
2700
2701                 as += moltype.atoms.nr;
2702             }
2703             done_graph(graph);
2704         }
2705     }
2706     sfree(graph);
2707 }
2708
2709 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2710                        const gmx_mtop_t *mtop, rvec x[])
2711 {
2712     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2713 }
2714
2715 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2716                  const gmx_mtop_t *mtop, rvec x[])
2717 {
2718     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2719 }
2720
2721 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2722 {
2723     int t, nth;
2724     nth = gmx_omp_nthreads_get(emntDefault);
2725
2726 #pragma omp parallel for num_threads(nth) schedule(static)
2727     for (t = 0; t < nth; t++)
2728     {
2729         try
2730         {
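                 /* Statically partition the atoms over the threads: thread t handles
                  * the half-open range [natoms*t/nth, natoms*(t+1)/nth), so e.g. 10
                  * atoms on 4 threads give chunks of 2, 3, 2, 3 atoms. */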
2731             size_t natoms = x.size();
2732             size_t offset = (natoms*t    )/nth;
2733             size_t len    = (natoms*(t + 1))/nth - offset;
2734             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2735         }
2736         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2737     }
2738 }
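
     /* A minimal usage sketch (hypothetical caller; the box and coordinates are
      * assumed to be set up elsewhere):
      *
      *   matrix                 box;       // box vectors, filled in by the caller
      *   std::vector<gmx::RVec> coords;    // coordinates to wrap into the box
      *   put_atoms_in_box_omp(epbcXYZ, box, coords);
      *
      * The std::vector converts implicitly to gmx::ArrayRef<gmx::RVec>. */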
2739
2740 // TODO This can be cleaned up a lot, and move back to runner.cpp
2741 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2742                 const t_inputrec *inputrec,
2743                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2744                 gmx_walltime_accounting_t walltime_accounting,
2745                 nonbonded_verlet_t *nbv,
2746                 const gmx_pme_t *pme,
2747                 gmx_bool bWriteStat)
2748 {
2749     t_nrnb *nrnb_tot = nullptr;
2750     double  delta_t  = 0;
2751     double  nbfs     = 0, mflop = 0;
2752     double  elapsed_time,
2753             elapsed_time_over_all_ranks,
2754             elapsed_time_over_all_threads,
2755             elapsed_time_over_all_threads_over_all_ranks;
2756     /* Control whether it is valid to print a report. Only the
2757        simulation master may print, but it should not do so if the run
2758        terminated e.g. before a scheduled reset step. This is
2759        complicated by the fact that PME ranks are unaware of the
2760        reason why they were sent a pmerecvqxFINISH. To avoid
2761        communication deadlocks, we always do the communication for the
2762        report, even if we've decided not to write the report, because
2763        how long it takes to finish the run is not important when we've
2764        decided not to report on the simulation performance.
2765
2766        Further, we only report performance for dynamical integrators,
2767        because those are the only ones for which we plan to
2768        consider doing any optimizations. */
2769     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2770
2771     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2772     {
2773         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2774         printReport = false;
2775     }
2776
2777     if (cr->nnodes > 1)
2778     {
2779         snew(nrnb_tot, 1);
2780 #if GMX_MPI
2781         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2782                       cr->mpi_comm_mysim);
2783 #endif
2784     }
2785     else
2786     {
2787         nrnb_tot = nrnb;
2788     }
2789
2790     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2791     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2792     if (cr->nnodes > 1)
2793     {
2794 #if GMX_MPI
2795         /* reduce elapsed_time over all MPI ranks in the current simulation */
2796         MPI_Allreduce(&elapsed_time,
2797                       &elapsed_time_over_all_ranks,
2798                       1, MPI_DOUBLE, MPI_SUM,
2799                       cr->mpi_comm_mysim);
2800         elapsed_time_over_all_ranks /= cr->nnodes;
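             /* The MPI_Allreduce summed the per-rank times; dividing by the number
              * of ranks turns the sum into the average elapsed time per rank. */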
2801         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2802          * current simulation. */
2803         MPI_Allreduce(&elapsed_time_over_all_threads,
2804                       &elapsed_time_over_all_threads_over_all_ranks,
2805                       1, MPI_DOUBLE, MPI_SUM,
2806                       cr->mpi_comm_mysim);
2807 #endif
2808     }
2809     else
2810     {
2811         elapsed_time_over_all_ranks                  = elapsed_time;
2812         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2813     }
2814
2815     if (printReport)
2816     {
2817         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2818     }
2819     if (cr->nnodes > 1)
2820     {
2821         sfree(nrnb_tot);
2822     }
2823
2824     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2825     {
2826         print_dd_statistics(cr, inputrec, fplog);
2827     }
2828
2829     /* TODO Move the responsibility for any scaling by thread counts
2830      * to the code that handled the thread region, so that there's a
2831      * mechanism to keep cycle counting working during the transition
2832      * to task parallelism. */
2833     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2834     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2835     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2836     auto cycle_sum(wallcycle_sum(cr, wcycle));
2837
2838     if (printReport)
2839     {
2840         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2841         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2842         if (pme_gpu_task_enabled(pme))
2843         {
2844             pme_gpu_get_timings(pme, &pme_gpu_timings);
2845         }
2846         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2847                         elapsed_time_over_all_ranks,
2848                         wcycle, cycle_sum,
2849                         nbnxn_gpu_timings,
2850                         &pme_gpu_timings);
2851
2852         if (EI_DYNAMICS(inputrec->eI))
2853         {
2854             delta_t = inputrec->delta_t;
2855         }
2856
2857         if (fplog)
2858         {
2859             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2860                        elapsed_time_over_all_ranks,
2861                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2862                        delta_t, nbfs, mflop);
2863         }
2864         if (bWriteStat)
2865         {
2866             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2867                        elapsed_time_over_all_ranks,
2868                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2869                        delta_t, nbfs, mflop);
2870         }
2871     }
2872 }
2873
2874 void initialize_lambdas(FILE               *fplog,
2875                         const t_inputrec   &ir,
2876                         bool                isMaster,
2877                         int                *fep_state,
2878                         gmx::ArrayRef<real> lambda,
2879                         double             *lam0)
2880 {
2881     /* TODO: Clean up initialization of fep_state and lambda in
2882        t_state.  This function works, but could probably use a logic
2883        rewrite to keep all the different types of efep straight. */
2884
2885     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2886     {
2887         return;
2888     }
2889
2890     const t_lambda *fep = ir.fepvals;
2891     if (isMaster)
2892     {
2893         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2894                                              a checkpoint, if one is set -- a kludge is in
2895                                              place for now to prevent this. */
2896     }
2897
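         /* Set each lambda component either from the legacy scalar init_lambda or
          * from the lambda vector entry of the initial free-energy state. */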
2898     for (int i = 0; i < efptNR; i++)
2899     {
2900         double thisLambda;
2901         /* for now, overwrite the lambda state with init_lambda, for backwards compatibility */
2902         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2903         {
2904             thisLambda = fep->init_lambda;
2905         }
2906         else
2907         {
2908             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2909         }
2910         if (isMaster)
2911         {
2912             lambda[i] = thisLambda;
2913         }
2914         if (lam0 != nullptr)
2915         {
2916             lam0[i] = thisLambda;
2917         }
2918     }
2919     if (ir.bSimTemp)
2920     {
2921         /* need to rescale control temperatures to match current state */
2922         for (int i = 0; i < ir.opts.ngtc; i++)
2923         {
2924             if (ir.opts.ref_t[i] > 0)
2925             {
2926                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2927             }
2928         }
2929     }
2930
2931     /* Send to the log the information on the current lambdas */
2932     if (fplog != nullptr)
2933     {
2934         fprintf(fplog, "Initial vector of lambda components: [ ");
2935         for (const auto &l : lambda)
2936         {
2937             fprintf(fplog, "%10.4f ", l);
2938         }
2939         fprintf(fplog, "]\n");
2940     }
2941 }