[alexxy/gromacs.git] src/gromacs/mdlib/sim_util.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed_forces/bonded.h"
65 #include "gromacs/listed_forces/disre.h"
66 #include "gromacs/listed_forces/gpubonded.h"
67 #include "gromacs/listed_forces/manage_threading.h"
68 #include "gromacs/listed_forces/orires.h"
69 #include "gromacs/math/arrayrefwithpadding.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/units.h"
72 #include "gromacs/math/vec.h"
73 #include "gromacs/math/vecdump.h"
74 #include "gromacs/mdlib/calcmu.h"
75 #include "gromacs/mdlib/calcvir.h"
76 #include "gromacs/mdlib/constr.h"
77 #include "gromacs/mdlib/force.h"
78 #include "gromacs/mdlib/forcerec.h"
79 #include "gromacs/mdlib/gmx_omp_nthreads.h"
80 #include "gromacs/mdlib/mdrun.h"
81 #include "gromacs/mdlib/nb_verlet.h"
82 #include "gromacs/mdlib/nbnxn_atomdata.h"
83 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
84 #include "gromacs/mdlib/nbnxn_grid.h"
85 #include "gromacs/mdlib/nbnxn_search.h"
86 #include "gromacs/mdlib/ppforceworkload.h"
87 #include "gromacs/mdlib/qmmm.h"
88 #include "gromacs/mdlib/update.h"
89 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
90 #include "gromacs/mdtypes/commrec.h"
91 #include "gromacs/mdtypes/forceoutput.h"
92 #include "gromacs/mdtypes/iforceprovider.h"
93 #include "gromacs/mdtypes/inputrec.h"
94 #include "gromacs/mdtypes/md_enums.h"
95 #include "gromacs/mdtypes/state.h"
96 #include "gromacs/pbcutil/ishift.h"
97 #include "gromacs/pbcutil/mshift.h"
98 #include "gromacs/pbcutil/pbc.h"
99 #include "gromacs/pulling/pull.h"
100 #include "gromacs/pulling/pull_rotation.h"
101 #include "gromacs/timing/cyclecounter.h"
102 #include "gromacs/timing/gpu_timing.h"
103 #include "gromacs/timing/wallcycle.h"
104 #include "gromacs/timing/wallcyclereporting.h"
105 #include "gromacs/timing/walltime_accounting.h"
106 #include "gromacs/topology/topology.h"
107 #include "gromacs/utility/arrayref.h"
108 #include "gromacs/utility/basedefinitions.h"
109 #include "gromacs/utility/cstringutil.h"
110 #include "gromacs/utility/exceptions.h"
111 #include "gromacs/utility/fatalerror.h"
112 #include "gromacs/utility/gmxassert.h"
113 #include "gromacs/utility/gmxmpi.h"
114 #include "gromacs/utility/logger.h"
115 #include "gromacs/utility/smalloc.h"
116 #include "gromacs/utility/strconvert.h"
117 #include "gromacs/utility/sysinfo.h"
118
119 #include "nbnxn_gpu.h"
120 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
121 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
122
123 // TODO: this environment variable allows us to verify before release
124 // that on less common architectures the total cost of polling is not larger than
125 // a blocking wait (so polling does not introduce overhead when the static
126 // PME-first ordering would suffice).
127 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
128
129
130 void print_time(FILE                     *out,
131                 gmx_walltime_accounting_t walltime_accounting,
132                 int64_t                   step,
133                 const t_inputrec         *ir,
134                 const t_commrec          *cr)
135 {
136     time_t finish;
137     double dt, elapsed_seconds, time_per_step;
138
139 #if !GMX_THREAD_MPI
140     if (!PAR(cr))
141 #endif
142     {
143         fprintf(out, "\r");
144     }
145     fputs("step ", out);
146     fputs(gmx::int64ToString(step).c_str(), out);
147     fflush(out);
148
149     if ((step >= ir->nstlist))
150     {
151         double seconds_since_epoch = gmx_gettime();
152         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
153         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
154         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
155
156         if (ir->nsteps >= 0)
157         {
158             if (dt >= 300)
159             {
160                 finish = static_cast<time_t>(seconds_since_epoch + dt);
161                 auto timebuf = gmx_ctime_r(&finish);
162                 timebuf.erase(timebuf.find_first_of('\n'));
163                 fputs(", will finish ", out);
164                 fputs(timebuf.c_str(), out);
165             }
166             else
167             {
168                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
169             }
170         }
171         else
172         {
173             fprintf(out, " performance: %.1f ns/day    ",
174                     ir->delta_t/1000*24*60*60/time_per_step);
175         }
176     }
177 #if !GMX_THREAD_MPI
178     if (PAR(cr))
179     {
180         fprintf(out, "\n");
181     }
182 #else
183     GMX_UNUSED_VALUE(cr);
184 #endif
185
186     fflush(out);
187 }
188
189 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
190                          double the_time)
191 {
192     if (!fplog)
193     {
194         return;
195     }
196
197     time_t temp_time = static_cast<time_t>(the_time);
198
199     auto   timebuf = gmx_ctime_r(&temp_time);
200
201     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
202 }
203
204 void print_start(FILE *fplog, const t_commrec *cr,
205                  gmx_walltime_accounting_t walltime_accounting,
206                  const char *name)
207 {
208     char buf[STRLEN];
209
210     sprintf(buf, "Started %s", name);
211     print_date_and_time(fplog, cr->nodeid, buf,
212                         walltime_accounting_get_start_time_stamp(walltime_accounting));
213 }
214
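/*! \brief Adds the forces in \p forceToAdd to \p f, distributing the work over OpenMP threads */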
215 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
216 {
217     const int      end = forceToAdd.size();
218
219     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221     for (int i = 0; i < end; i++)
222     {
223         rvec_inc(f[i], forceToAdd[i]);
224     }
225 }
226
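/*! \brief Calculates the local short-range virial contribution from the shift forces and the per-atom forces */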
227 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
228                         tensor vir_part, const t_graph *graph, const matrix box,
229                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
230 {
231     /* The short-range virial from surrounding boxes */
232     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
233     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
234
235     /* Calculate partial virial, for local atoms only, based on short range.
236      * Total virial is computed in global_stat, called from do_md
237      */
238     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
239     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
240
241     if (debug)
242     {
243         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
244     }
245 }
246
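/*! \brief Computes the COM pull potential and forces and accumulates the energy and dV/dlambda into \p enerd */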
247 static void pull_potential_wrapper(const t_commrec *cr,
248                                    const t_inputrec *ir,
249                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
250                                    gmx::ForceWithVirial *force,
251                                    const t_mdatoms *mdatoms,
252                                    gmx_enerdata_t *enerd,
253                                    const real *lambda,
254                                    double t,
255                                    gmx_wallcycle_t wcycle)
256 {
257     t_pbc  pbc;
258     real   dvdl;
259
260     /* Calculate the center of mass forces; this requires communication,
261      * which is why pull_potential is called close to other communication.
262      */
263     wallcycle_start(wcycle, ewcPULLPOT);
264     set_pbc(&pbc, ir->ePBC, box);
265     dvdl                     = 0;
266     enerd->term[F_COM_PULL] +=
267         pull_potential(ir->pull_work, mdatoms, &pbc,
268                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
269     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
270     wallcycle_stop(wcycle, ewcPULLPOT);
271 }
272
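/*! \brief Receives the long-range forces, energies and dV/dlambda from the separate PME ranks and adds them to the local terms */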
273 static void pme_receive_force_ener(const t_commrec      *cr,
274                                    gmx::ForceWithVirial *forceWithVirial,
275                                    gmx_enerdata_t       *enerd,
276                                    gmx_wallcycle_t       wcycle)
277 {
278     real   e_q, e_lj, dvdl_q, dvdl_lj;
279     float  cycles_ppdpme, cycles_seppme;
280
281     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
282     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
283
284     /* In case of node-splitting, the PP nodes receive the long-range
285      * forces, virial and energy from the PME nodes here.
286      */
287     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
288     dvdl_q  = 0;
289     dvdl_lj = 0;
290     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
291                       &cycles_seppme);
292     enerd->term[F_COUL_RECIP] += e_q;
293     enerd->term[F_LJ_RECIP]   += e_lj;
294     enerd->dvdl_lin[efptCOUL] += dvdl_q;
295     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
296
297     if (wcycle)
298     {
299         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
300     }
301     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
302 }
303
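/*! \brief Prints coordinates and force norms for atoms whose force exceeds \p forceTolerance and issues a fatal error when non-finite forces are detected */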
304 static void print_large_forces(FILE            *fp,
305                                const t_mdatoms *md,
306                                const t_commrec *cr,
307                                int64_t          step,
308                                real             forceTolerance,
309                                const rvec      *x,
310                                const rvec      *f)
311 {
312     real           force2Tolerance = gmx::square(forceTolerance);
313     gmx::index     numNonFinite    = 0;
314     for (int i = 0; i < md->homenr; i++)
315     {
316         real force2    = norm2(f[i]);
317         bool nonFinite = !std::isfinite(force2);
318         if (force2 >= force2Tolerance || nonFinite)
319         {
320             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
321                     step,
322                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
323         }
324         if (nonFinite)
325         {
326             numNonFinite++;
327         }
328     }
329     if (numNonFinite > 0)
330     {
331         /* Note that with MPI this fatal call on one rank might interrupt
332          * the printing on other ranks. But we can only avoid that with
333          * an expensive MPI barrier that we would need at each step.
334          */
335         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
336     }
337 }
338
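/*! \brief Adds the separately accumulated direct-virial forces (spreading virtual-site contributions first) to \p f and \p vir_force, and optionally prints large forces */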
339 static void post_process_forces(const t_commrec           *cr,
340                                 int64_t                    step,
341                                 t_nrnb                    *nrnb,
342                                 gmx_wallcycle_t            wcycle,
343                                 const gmx_localtop_t      *top,
344                                 const matrix               box,
345                                 const rvec                 x[],
346                                 rvec                       f[],
347                                 gmx::ForceWithVirial      *forceWithVirial,
348                                 tensor                     vir_force,
349                                 const t_mdatoms           *mdatoms,
350                                 const t_graph             *graph,
351                                 const t_forcerec          *fr,
352                                 const gmx_vsite_t         *vsite,
353                                 int                        flags)
354 {
355     if (fr->haveDirectVirialContributions)
356     {
357         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
358
359         if (vsite)
360         {
361             /* Spread the mesh force on virtual sites to the other particles...
362              * This is parallelized. MPI communication is performed
363              * if the constructing atoms aren't local.
364              */
365             matrix virial = { { 0 } };
366             spread_vsite_f(vsite, x, fDirectVir, nullptr,
367                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
368                            nrnb,
369                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
370             forceWithVirial->addVirialContribution(virial);
371         }
372
373         if (flags & GMX_FORCE_VIRIAL)
374         {
375             /* Now add the forces, this is local */
376             sum_forces(f, forceWithVirial->force_);
377
378             /* Add the direct virial contributions */
379             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
380             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
381
382             if (debug)
383             {
384                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
385             }
386         }
387     }
388
389     if (fr->print_force >= 0)
390     {
391         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
392     }
393 }
394
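/*! \brief Runs or launches the nbnxn nonbonded kernel matching the requested locality and kernel type, and updates the flop counters */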
395 static void do_nb_verlet(const t_forcerec *fr,
396                          const interaction_const_t *ic,
397                          gmx_enerdata_t *enerd,
398                          int flags, int ilocality,
399                          int clearF,
400                          int64_t step,
401                          t_nrnb *nrnb,
402                          gmx_wallcycle_t wcycle)
403 {
404     if (!(flags & GMX_FORCE_NONBONDED))
405     {
406         /* skip non-bonded calculation */
407         return;
408     }
409
410     nonbonded_verlet_t       *nbv  = fr->nbv;
411     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
412
413     /* GPU kernel launch overhead is already timed separately */
414     if (fr->cutoff_scheme != ecutsVERLET)
415     {
416         gmx_incons("Invalid cut-off scheme passed!");
417     }
418
419     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
420
421     if (!bUsingGpuKernels)
422     {
423         /* When dynamic pair-list pruning is requested, we need to prune
424          * every nstlistPrune steps.
425          */
426         if (nbv->listParams->useDynamicPruning &&
427             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
428         {
429             /* Prune the pair-list beyond fr->ic->rlistPrune using
430              * the current coordinates of the atoms.
431              */
432             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
433             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
434             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
435         }
436
437         wallcycle_sub_start(wcycle, ewcsNONBONDED);
438     }
439
440     switch (nbvg->kernel_type)
441     {
442         case nbnxnk4x4_PlainC:
443         case nbnxnk4xN_SIMD_4xN:
444         case nbnxnk4xN_SIMD_2xNN:
445             nbnxn_kernel_cpu(nbvg,
446                              nbv->nbat,
447                              ic,
448                              fr->shift_vec,
449                              flags,
450                              clearF,
451                              fr->fshift[0],
452                              enerd->grpp.ener[egCOULSR],
453                              fr->bBHAM ?
454                              enerd->grpp.ener[egBHAMSR] :
455                              enerd->grpp.ener[egLJSR]);
456             break;
457
458         case nbnxnk8x8x8_GPU:
459             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
460             break;
461
462         case nbnxnk8x8x8_PlainC:
463             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nblGpu[0],
464                                  nbv->nbat, ic,
465                                  fr->shift_vec,
466                                  flags,
467                                  clearF,
468                                  nbv->nbat->out[0].f,
469                                  fr->fshift[0],
470                                  enerd->grpp.ener[egCOULSR],
471                                  fr->bBHAM ?
472                                  enerd->grpp.ener[egBHAMSR] :
473                                  enerd->grpp.ener[egLJSR]);
474             break;
475
476         default:
477             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
478
479     }
480     if (!bUsingGpuKernels)
481     {
482         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
483     }
484
485     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
486     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
487     {
488         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
489     }
490     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
491              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
492     {
493         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
494     }
495     else
496     {
497         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
498     }
499     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
500     if (flags & GMX_FORCE_ENERGY)
501     {
502         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
503         enr_nbnxn_kernel_ljc += 1;
504         enr_nbnxn_kernel_lj  += 1;
505     }
506
507     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
508              nbvg->nbl_lists.natpair_ljq);
509     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
510              nbvg->nbl_lists.natpair_lj);
511     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
512     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
513              nbvg->nbl_lists.natpair_q);
514
515     if (ic->vdw_modifier == eintmodFORCESWITCH)
516     {
517         /* We add up the switch cost separately */
518         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
519                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
520     }
521     if (ic->vdw_modifier == eintmodPOTSWITCH)
522     {
523         /* We add up the switch cost separately */
524         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
525                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
526     }
527     if (ic->vdwtype == evdwPME)
528     {
529         /* We add up the LJ Ewald cost separately */
530         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
531                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
532     }
533 }
534
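/*! \brief Computes the perturbed nonbonded interactions with the free-energy kernel, including foreign-lambda energies when requested */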
535 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
536                              t_forcerec           *fr,
537                              rvec                  x[],
538                              rvec                  f[],
539                              const t_mdatoms      *mdatoms,
540                              t_lambda             *fepvals,
541                              real                 *lambda,
542                              gmx_enerdata_t       *enerd,
543                              int                   flags,
544                              t_nrnb               *nrnb,
545                              gmx_wallcycle_t       wcycle)
546 {
547     int              donb_flags;
548     nb_kernel_data_t kernel_data;
549     real             lam_i[efptNR];
550     real             dvdl_nb[efptNR];
551     int              th;
552     int              i, j;
553
554     donb_flags = 0;
555     /* Add short-range interactions */
556     donb_flags |= GMX_NONBONDED_DO_SR;
557
558     /* Currently all group scheme kernels always calculate (shift-)forces */
559     if (flags & GMX_FORCE_FORCES)
560     {
561         donb_flags |= GMX_NONBONDED_DO_FORCE;
562     }
563     if (flags & GMX_FORCE_VIRIAL)
564     {
565         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
566     }
567     if (flags & GMX_FORCE_ENERGY)
568     {
569         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
570     }
571
572     kernel_data.flags  = donb_flags;
573     kernel_data.lambda = lambda;
574     kernel_data.dvdl   = dvdl_nb;
575
576     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
577     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
578
579     /* reset free energy components */
580     for (i = 0; i < efptNR; i++)
581     {
582         dvdl_nb[i]  = 0;
583     }
584
585     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
586
587     wallcycle_sub_start(wcycle, ewcsNONBONDED);
588 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
589     for (th = 0; th < nbl_lists->nnbl; th++)
590     {
591         try
592         {
593             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
594                                       x, f, fr, mdatoms, &kernel_data, nrnb);
595         }
596         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
597     }
598
599     if (fepvals->sc_alpha != 0)
600     {
601         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
602         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
603     }
604     else
605     {
606         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
607         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
608     }
609
610     /* If we do foreign lambda and we have soft-core interactions
611      * we have to recalculate the (non-linear) energy contributions.
612      */
613     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
614     {
615         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
616         kernel_data.lambda         = lam_i;
617         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
618         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
619         /* Note that we add to kernel_data.dvdl, but ignore the result */
620
621         for (i = 0; i < enerd->n_lambda; i++)
622         {
623             for (j = 0; j < efptNR; j++)
624             {
625                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
626             }
627             reset_foreign_enerdata(enerd);
628 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
629             for (th = 0; th < nbl_lists->nnbl; th++)
630             {
631                 try
632                 {
633                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
634                                               x, f, fr, mdatoms, &kernel_data, nrnb);
635                 }
636                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
637             }
638
639             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
640             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
641         }
642     }
643
644     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
645 }
646
647 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
648 {
649     return nbv != nullptr && nbv->bUseGPU;
650 }
651
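/*! \brief Clears \p n rvecs in \p v, using OpenMP threads when that is likely to be beneficial */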
652 static inline void clear_rvecs_omp(int n, rvec v[])
653 {
654     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
655
656     /* Note that we would like to avoid this conditional by putting it
657      * into the omp pragma instead, but then we still take the full
658      * omp parallel for overhead (at least with gcc5).
659      */
660     if (nth == 1)
661     {
662         for (int i = 0; i < n; i++)
663         {
664             clear_rvec(v[i]);
665         }
666     }
667     else
668     {
669 #pragma omp parallel for num_threads(nth) schedule(static)
670         for (int i = 0; i < n; i++)
671         {
672             clear_rvec(v[i]);
673         }
674     }
675 }
676
677 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
678  *
679  * \param groupOptions  Group options, containing T-coupling options
680  */
681 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
682 {
683     real nrdfCoupled   = 0;
684     real nrdfUncoupled = 0;
685     real kineticEnergy = 0;
686     for (int g = 0; g < groupOptions.ngtc; g++)
687     {
688         if (groupOptions.tau_t[g] >= 0)
689         {
690             nrdfCoupled   += groupOptions.nrdf[g];
691             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
692         }
693         else
694         {
695             nrdfUncoupled += groupOptions.nrdf[g];
696         }
697     }
698
699     /* This conditional with > also catches nrdf=0 */
700     if (nrdfCoupled > nrdfUncoupled)
701     {
702         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
703     }
704     else
705     {
706         return 0;
707     }
708 }
709
710 /*! \brief This routine checks that the potential energy is finite.
711  *
712  * Always checks that the potential energy is finite. If step equals
713  * inputrec.init_step also checks that the magnitude of the potential energy
714  * is reasonable. Terminates with a fatal error when a check fails.
715  * Note that passing this check does not guarantee finite forces,
716  * since those use slightly different arithmetic. But in most cases
717  * there is just a narrow coordinate range where forces are not finite
718  * and energies are finite.
719  *
720  * \param[in] step      The step number, used for checking and printing
721  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
722  * \param[in] inputrec  The input record
723  */
724 static void checkPotentialEnergyValidity(int64_t               step,
725                                          const gmx_enerdata_t &enerd,
726                                          const t_inputrec     &inputrec)
727 {
728     /* Threshold valid for comparing absolute potential energy against
729      * the kinetic energy. Normally one should not consider absolute
730      * potential energy values, but with a factor of one million
731      * we should never get false positives.
732      */
733     constexpr real c_thresholdFactor = 1e6;
734
735     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
736     real           averageKineticEnergy = 0;
737     /* We only check for large potential energy at the initial step,
738      * because that is by far the most likely step for this to occur
739      * and because computing the average kinetic energy is not free.
740      * Note: nstcalcenergy >> 1 often does not allow catching large energies
741      * before they become NaN.
742      */
743     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
744     {
745         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
746     }
747
748     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
749                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
750     {
751         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
752                   step,
753                   enerd.term[F_EPOT],
754                   energyIsNotFinite ? "not finite" : "extremely high",
755                   enerd.term[F_LJ],
756                   enerd.term[F_COUL_SR],
757                   energyIsNotFinite ? "non-finite" : "very high",
758                   energyIsNotFinite ? " or NaN" : "");
759     }
760 }
761
762 /*! \brief Compute forces and/or energies for special algorithms
763  *
764  * The intention is to collect all calls to algorithms that compute
765  * forces on local atoms only and that do not contribute to the local
766  * virial sum (but add their virial contribution separately).
767  * Eventually these should likely all become ForceProviders.
768  * Within this function the intention is to have algorithms that do
769  * global communication at the end, so global barriers within the MD loop
770  * are as close together as possible.
771  *
772  * \param[in]     fplog            The log file
773  * \param[in]     cr               The communication record
774  * \param[in]     inputrec         The input record
775  * \param[in]     awh              The Awh module (nullptr if none in use).
776  * \param[in]     enforcedRotation Enforced rotation module.
777  * \param[in]     step             The current MD step
778  * \param[in]     t                The current time
779  * \param[in,out] wcycle           Wallcycle accounting struct
780  * \param[in,out] forceProviders   Pointer to a list of force providers
781  * \param[in]     box              The unit cell
782  * \param[in]     x                The coordinates
783  * \param[in]     mdatoms          Per atom properties
784  * \param[in]     lambda           Array of free-energy lambda values
785  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
786  * \param[in,out] forceWithVirial  Force and virial buffers
787  * \param[in,out] enerd            Energy buffer
788  * \param[in,out] ed               Essential dynamics pointer
789  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
790  *
791  * \todo Remove bNS, which is used incorrectly.
792  * \todo Convert all other algorithms called here to ForceProviders.
793  */
794 static void
795 computeSpecialForces(FILE                          *fplog,
796                      const t_commrec               *cr,
797                      const t_inputrec              *inputrec,
798                      gmx::Awh                      *awh,
799                      gmx_enfrot                    *enforcedRotation,
800                      int64_t                        step,
801                      double                         t,
802                      gmx_wallcycle_t                wcycle,
803                      ForceProviders                *forceProviders,
804                      matrix                         box,
805                      gmx::ArrayRef<const gmx::RVec> x,
806                      const t_mdatoms               *mdatoms,
807                      real                          *lambda,
808                      int                            forceFlags,
809                      gmx::ForceWithVirial          *forceWithVirial,
810                      gmx_enerdata_t                *enerd,
811                      gmx_edsam                     *ed,
812                      gmx_bool                       bNS)
813 {
814     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
815
816     /* NOTE: Currently all ForceProviders only provide forces.
817      *       When they also provide energies, remove this conditional.
818      */
819     if (computeForces)
820     {
821         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
822         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
823
824         /* Collect forces from modules */
825         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
826     }
827
828     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
829     {
830         pull_potential_wrapper(cr, inputrec, box, x,
831                                forceWithVirial,
832                                mdatoms, enerd, lambda, t,
833                                wcycle);
834
835         if (awh)
836         {
837             enerd->term[F_COM_PULL] +=
838                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
839                                                   forceWithVirial,
840                                                   t, step, wcycle, fplog);
841         }
842     }
843
844     rvec *f = as_rvec_array(forceWithVirial->force_.data());
845
846     /* Add the forces from enforced rotation potentials (if any) */
847     if (inputrec->bRot)
848     {
849         wallcycle_start(wcycle, ewcROTadd);
850         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
851         wallcycle_stop(wcycle, ewcROTadd);
852     }
853
854     if (ed)
855     {
856         /* Note that since init_edsam() is called after the initialization
857          * of forcerec, edsam doesn't request the noVirSum force buffer.
858          * Thus if no other algorithm (e.g. PME) requires it, the forces
859          * here will contribute to the virial.
860          */
861         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
862     }
863
864     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
865     if (inputrec->bIMD && computeForces)
866     {
867         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
868     }
869 }
870
871 /*! \brief Launch the prepare_step and spread stages of PME GPU.
872  *
873  * \param[in]  pmedata       The PME structure
874  * \param[in]  box           The box matrix
875  * \param[in]  x             Coordinate array
876  * \param[in]  flags         Force flags
877  * \param[in]  pmeFlags      PME flags
878  * \param[in]  wcycle        The wallcycle structure
879  */
880 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
881                                       matrix          box,
882                                       rvec            x[],
883                                       int             flags,
884                                       int             pmeFlags,
885                                       gmx_wallcycle_t wcycle)
886 {
887     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
888     pme_gpu_launch_spread(pmedata, x, wcycle);
889 }
890
891 /*! \brief Launch the FFT and gather stages of PME GPU
892  *
893  * This function only implements setting the output forces (no accumulation).
894  *
895  * \param[in]  pmedata        The PME structure
896  * \param[in]  wcycle         The wallcycle structure
897  */
898 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
899                                      gmx_wallcycle_t   wcycle)
900 {
901     pme_gpu_launch_complex_transforms(pmedata, wcycle);
902     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
903 }
904
905 /*! \brief
906  *  Polling wait for either of the PME or nonbonded GPU tasks.
907  *
908  * Instead of a static order in waiting for GPU tasks, this function
909  * polls to check which of the two tasks completes first, and does the
910  * associated force buffer reduction overlapped with the other task.
911  * Unlike a static scheduling order, this can always overlap one of the
912  * reductions, regardless of the GPU task completion order.
913  *
914  * \param[in]     nbv              Nonbonded verlet structure
915  * \param[in,out] pmedata          PME module data
916  * \param[in,out] force            Force array to reduce task outputs into.
917  * \param[in,out] forceWithVirial  Force and virial buffers
918  * \param[in,out] fshift           Shift force output vector results are reduced into
919  * \param[in,out] enerd            Energy data structure results are reduced into
920  * \param[in]     flags            Force flags
921  * \param[in]     pmeFlags         PME flags
922  * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
923  * \param[in]     wcycle           The wallcycle structure
924  */
925 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
926                                         gmx_pme_t                           *pmedata,
927                                         gmx::ArrayRefWithPadding<gmx::RVec> *force,
928                                         gmx::ForceWithVirial                *forceWithVirial,
929                                         rvec                                 fshift[],
930                                         gmx_enerdata_t                      *enerd,
931                                         int                                  flags,
932                                         int                                  pmeFlags,
933                                         bool                                 haveOtherWork,
934                                         gmx_wallcycle_t                      wcycle)
935 {
936     bool isPmeGpuDone = false;
937     bool isNbGpuDone  = false;
938
939
940     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
941
942     while (!isPmeGpuDone || !isNbGpuDone)
943     {
944         if (!isPmeGpuDone)
945         {
946             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
947             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
948         }
949
950         if (!isNbGpuDone)
951         {
952             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
953             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
954             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
955                                                     flags, eatLocal,
956                                                     haveOtherWork,
957                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
958                                                     fshift, completionType);
959             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
960             // To get the call count right, when the task finishes we
961             // issue a start/stop.
962             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
963             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
964             if (isNbGpuDone)
965             {
966                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
967                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
968
969                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
970                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
971             }
972         }
973     }
974 }
975
976 /*! \brief
977  *  Launch the dynamic rolling pruning GPU task.
978  *
979  *  We currently alternate local/non-local list pruning in odd-even steps
980  *  (only pruning every second step without DD).
981  *
982  * \param[in]     cr               The communication record
983  * \param[in]     nbv              Nonbonded verlet structure
984  * \param[in]     inputrec         The input record
985  * \param[in]     step             The current MD step
986  */
987 static inline void launchGpuRollingPruning(const t_commrec          *cr,
988                                            const nonbonded_verlet_t *nbv,
989                                            const t_inputrec         *inputrec,
990                                            const int64_t             step)
991 {
992     /* We should not launch the rolling pruning kernel at a search
993      * step or just before search steps, since that's useless.
994      * Without domain decomposition we prune at even steps.
995      * With domain decomposition we alternate local and non-local
996      * pruning at even and odd steps.
997      */
998     int  numRollingParts     = nbv->listParams->numRollingParts;
999     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1000     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1001     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1002     if (stepWithCurrentList > 0 &&
1003         stepWithCurrentList < inputrec->nstlist - 1 &&
1004         (stepIsEven || DOMAINDECOMP(cr)))
1005     {
1006         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1007                                           stepIsEven ? eintLocal : eintNonlocal,
1008                                           numRollingParts);
1009     }
1010 }
1011
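/*! \brief Computes forces and energies with the Verlet cut-off scheme, launching GPU nonbonded, bonded and PME tasks where they are enabled */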
1012 static void do_force_cutsVERLET(FILE *fplog,
1013                                 const t_commrec *cr,
1014                                 const gmx_multisim_t *ms,
1015                                 const t_inputrec *inputrec,
1016                                 gmx::Awh *awh,
1017                                 gmx_enfrot *enforcedRotation,
1018                                 int64_t step,
1019                                 t_nrnb *nrnb,
1020                                 gmx_wallcycle_t wcycle,
1021                                 const gmx_localtop_t *top,
1022                                 const gmx_groups_t * /* groups */,
1023                                 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1024                                 history_t *hist,
1025                                 gmx::ArrayRefWithPadding<gmx::RVec> force,
1026                                 tensor vir_force,
1027                                 const t_mdatoms *mdatoms,
1028                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1029                                 real *lambda,
1030                                 t_graph *graph,
1031                                 t_forcerec *fr,
1032                                 gmx::PpForceWorkload *ppForceWorkload,
1033                                 interaction_const_t *ic,
1034                                 const gmx_vsite_t *vsite,
1035                                 rvec mu_tot,
1036                                 double t,
1037                                 gmx_edsam *ed,
1038                                 const int flags,
1039                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1040                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1041 {
1042     int                 cg1, i, j;
1043     double              mu[2*DIM];
1044     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1045     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1046     rvec                vzero, box_diag;
1047     float               cycles_pme, cycles_wait_gpu;
1048     nonbonded_verlet_t *nbv = fr->nbv;
1049
1050     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1051     bNS           = ((flags & GMX_FORCE_NS) != 0);
1052     bFillGrid     = (bNS && bStateChanged);
1053     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1054     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
1055     bUseGPU       = fr->nbv->bUseGPU;
1056     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1057
1058     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1059     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1060     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1061         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1062     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
1063         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
1064         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
1065         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
1066
1067     /* At a search step we need to start the first balancing region
1068      * somewhere early inside the step after communication during domain
1069      * decomposition (and not during the previous step as usual).
1070      */
1071     if (bNS &&
1072         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1073     {
1074         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1075     }
1076
1077     cycles_wait_gpu = 0;
1078
1079     const int start  = 0;
1080     const int homenr = mdatoms->homenr;
1081
1082     clear_mat(vir_force);
1083
1084     if (DOMAINDECOMP(cr))
1085     {
1086         cg1 = cr->dd->globalAtomGroupIndices.size();
1087     }
1088     else
1089     {
1090         cg1 = top->cgs.nr;
1091     }
1092     if (fr->n_tpi > 0)
1093     {
1094         cg1--;
1095     }
1096
1097     if (bStateChanged)
1098     {
1099         update_forcerec(fr, box);
1100
1101         if (inputrecNeedMutot(inputrec))
1102         {
1103             /* Calculate total (local) dipole moment in a temporary common array.
1104              * This makes it possible to sum them over nodes faster.
1105              */
1106             calc_mu(start, homenr,
1107                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1108                     mu, mu+DIM);
1109         }
1110     }
1111
1112     if (fr->ePBC != epbcNONE)
1113     {
1114         /* Compute shift vectors every step,
1115          * because of pressure coupling or box deformation!
1116          */
1117         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1118         {
1119             calc_shifts(box, fr->shift_vec);
1120         }
1121
1122         if (bCalcCGCM)
1123         {
1124             put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1125             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1126         }
1127         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1128         {
1129             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1130         }
1131     }
1132
1133     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1134                                  fr->shift_vec, nbv->nbat);
1135
1136 #if GMX_MPI
1137     if (!thisRankHasDuty(cr, DUTY_PME))
1138     {
1139         /* Send particle coordinates to the pme nodes.
1140          * Since this is only implemented for domain decomposition
1141          * and domain decomposition does not use the graph,
1142          * we do not need to worry about shifting.
1143          */
1144         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1145                                  lambda[efptCOUL], lambda[efptVDW],
1146                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1147                                  step, wcycle);
1148     }
1149 #endif /* GMX_MPI */
1150
1151     if (useGpuPme)
1152     {
1153         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1154     }
1155
1156     /* do gridding for pair search */
1157     if (bNS)
1158     {
1159         if (graph && bStateChanged)
1160         {
1161             /* Calculate intramolecular shift vectors to make molecules whole */
1162             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1163         }
1164
1165         clear_rvec(vzero);
1166         box_diag[XX] = box[XX][XX];
1167         box_diag[YY] = box[YY][YY];
1168         box_diag[ZZ] = box[ZZ][ZZ];
1169
1170         wallcycle_start(wcycle, ewcNS);
1171         if (!DOMAINDECOMP(cr))
1172         {
1173             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1174             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1175                               0, vzero, box_diag,
1176                               nullptr, 0, mdatoms->homenr, -1,
1177                               fr->cginfo, x.unpaddedArrayRef(),
1178                               0, nullptr,
1179                               nbv->grp[eintLocal].kernel_type,
1180                               nbv->nbat);
1181             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1182         }
1183         else
1184         {
1185             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1186             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1187                                        fr->cginfo, x.unpaddedArrayRef(),
1188                                        nbv->grp[eintNonlocal].kernel_type,
1189                                        nbv->nbat);
1190             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1191         }
1192
1193         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1194
1195         wallcycle_stop(wcycle, ewcNS);
1196     }
1197
1198     /* initialize the GPU atom data and copy shift vector */
1199     if (bUseGPU)
1200     {
1201         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1202         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1203
1204         if (bNS)
1205         {
1206             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1207         }
1208
1209         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1210
1211         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1212
1213         if (bNS && fr->gpuBonded)
1214         {
1215             /* Now that we have put all atoms on the grid, we can assign
1216              * bonded interactions to the GPU, where the grid order is
1217              * needed. Also the xq, f and fshift device buffers have
1218              * been reallocated if needed, so the bonded code can
1219              * learn about them. */
1220             // TODO the xq, f, and fshift buffers are now shared
1221             // resources, so they should be maintained by a
1222             // higher-level object than the nb module.
1223             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1224                                                                   top->idef,
1225                                                                   nbnxn_gpu_get_xq(nbv->gpu_nbv),
1226                                                                   nbnxn_gpu_get_f(nbv->gpu_nbv),
1227                                                                   nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1228             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1229         }
1230
1231         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1232     }
1233
1234     /* do local pair search */
1235     if (bNS)
1236     {
1237         wallcycle_start_nocount(wcycle, ewcNS);
1238         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1239         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1240                             &top->excls,
1241                             nbv->listParams->rlistOuter,
1242                             nbv->min_ci_balanced,
1243                             &nbv->grp[eintLocal].nbl_lists,
1244                             eintLocal,
1245                             nbv->grp[eintLocal].kernel_type,
1246                             nrnb);
1247         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1248         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1249         {
1250             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1251         }
1252         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1253
1254         if (bUseGPU)
1255         {
1256             /* initialize local pair-list on the GPU */
1257             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1258                                     nbv->grp[eintLocal].nbl_lists.nblGpu[0],
1259                                     eintLocal);
1260         }
1261         wallcycle_stop(wcycle, ewcNS);
1262     }
1263     else
1264     {
1265         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1266                                         nbv->nbat, wcycle);
1267     }
1268
1269     if (bUseGPU)
1270     {
1271         if (DOMAINDECOMP(cr))
1272         {
1273             ddOpenBalanceRegionGpu(cr->dd);
1274         }
1275
1276         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1277
1278         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1279         nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
1280         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1281
1282         // Bonded work is not split into separate local and non-local kernels, so with DD
1283         // we can only launch the kernel after the non-local coordinates have been received.
1284         if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1285         {
1286             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1287             fr->gpuBonded->launchKernels(fr, flags, box);
1288             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1289         }
1290
1291         /* launch local nonbonded work on GPU */
1292         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1294                      step, nrnb, wcycle);
1295         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1296         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1297     }
1298
1299     if (useGpuPme)
1300     {
1301         // In PME GPU and mixed mode we launch FFT / gather after the
1302         // X copy/transform to allow overlap as well as after the GPU NB
1303         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1304         // the nonbonded kernel.
1305         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1306     }
1307
1308     /* Communicate coordinates and sum dipole if necessary +
1309        do non-local pair search */
1310     if (DOMAINDECOMP(cr))
1311     {
1312         if (bNS)
1313         {
1314             wallcycle_start_nocount(wcycle, ewcNS);
1315             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1316
1317             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1318                                 &top->excls,
1319                                 nbv->listParams->rlistOuter,
1320                                 nbv->min_ci_balanced,
1321                                 &nbv->grp[eintNonlocal].nbl_lists,
1322                                 eintNonlocal,
1323                                 nbv->grp[eintNonlocal].kernel_type,
1324                                 nrnb);
1325             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1326             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1327             {
1328                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1329             }
1330             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1331
1332             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1333             {
1334                 /* initialize non-local pair-list on the GPU */
1335                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1336                                         nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
1337                                         eintNonlocal);
1338             }
1339             wallcycle_stop(wcycle, ewcNS);
1340         }
1341         else
1342         {
1343             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1344
1345             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1346                                             nbv->nbat, wcycle);
1347         }
1348
1349         if (bUseGPU)
1350         {
1351             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1352
1353             /* launch non-local nonbonded tasks on GPU */
1354             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1355             nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1356             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1357
1358             if (ppForceWorkload->haveGpuBondedWork)
1359             {
1360                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1361                 fr->gpuBonded->launchKernels(fr, flags, box);
1362                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1363             }
1364
1365             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1366             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1367                          step, nrnb, wcycle);
1368             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1369
1370             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1371         }
1372     }
1373
1374     if (bUseGPU)
1375     {
1376         /* launch D2H copy-back F */
1377         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1378         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1379         if (DOMAINDECOMP(cr))
1380         {
1381             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1382                                      flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1383         }
1384         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1385                                  flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1386         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1387
1388         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1389         {
1390             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1391             fr->gpuBonded->launchEnergyTransfer();
1392             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1393         }
1394         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1395     }
1396
1397     if (bStateChanged && inputrecNeedMutot(inputrec))
1398     {
1399         if (PAR(cr))
1400         {
1401             gmx_sumd(2*DIM, mu, cr);
1402             ddReopenBalanceRegionCpu(cr->dd);
1403         }
1404
1405         for (i = 0; i < 2; i++)
1406         {
1407             for (j = 0; j < DIM; j++)
1408             {
1409                 fr->mu_tot[i][j] = mu[i*DIM + j];
1410             }
1411         }
1412     }
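         /* With free energy perturbation the total dipole below is interpolated
          * linearly in the coulomb lambda between the state A and state B dipoles.
          */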
1413     if (fr->efep == efepNO)
1414     {
1415         copy_rvec(fr->mu_tot[0], mu_tot);
1416     }
1417     else
1418     {
1419         for (j = 0; j < DIM; j++)
1420         {
1421             mu_tot[j] =
1422                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1423                 lambda[efptCOUL]*fr->mu_tot[1][j];
1424         }
1425     }
1426
1427     /* Reset energies */
1428     reset_enerdata(enerd);
1429     clear_rvecs(SHIFTS, fr->fshift);
1430
1431     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1432     {
1433         wallcycle_start(wcycle, ewcPPDURINGPME);
1434         dd_force_flop_start(cr->dd, nrnb);
1435     }
1436
1437     if (inputrec->bRot)
1438     {
1439         wallcycle_start(wcycle, ewcROT);
1440         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1441         wallcycle_stop(wcycle, ewcROT);
1442     }
1443
1444     /* Temporary solution until all routines take PaddedRVecVector */
1445     rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1446
1447     /* Start the force cycle counter.
1448      * Note that a different counter is used for dynamic load balancing.
1449      */
1450     wallcycle_start(wcycle, ewcFORCE);
1451
1452     gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1453     if (bDoForces)
1454     {
1455         /* If we need to compute the virial, we might need a separate
1456          * force buffer for algorithms for which the virial is calculated
1457          * directly, such as PME.
1458          */
1459         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1460         {
1461             forceRef = *fr->forceBufferForDirectVirialContributions;
1462
1463             /* TODO: update comment
1464              * We only compute forces on local atoms. Note that vsites can
1465              * spread to non-local atoms, but that part of the buffer is
1466              * cleared separately in the vsite spreading code.
1467              */
1468             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1469         }
1470         /* Clear the short- and long-range forces */
1471         clear_rvecs_omp(fr->natoms_force_constr, f);
1472     }
1473
1474     /* forceWithVirial uses the local atom range only */
1475     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1476
1477     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1478     {
1479         clear_pull_forces(inputrec->pull_work);
1480     }
1481
1482     /* We calculate the non-bonded forces, when done on the CPU, here.
1483      * We do this before calling do_force_lowlevel, because in that
1484      * function, the listed forces are calculated before PME, which
1485      * does communication.  With this order, non-bonded and listed
1486      * force calculation imbalance can be balanced out by the domain
1487      * decomposition load balancing.
1488      */
1489
1490     if (!bUseOrEmulGPU)
1491     {
1492         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1493                      step, nrnb, wcycle);
1494     }
1495
1496     if (fr->efep != efepNO)
1497     {
1498         /* Calculate the local and non-local free energy interactions here.
1499          * These run on the CPU both with and without a GPU.
1500          */
1501         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1502         {
1503             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1504                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1505                              inputrec->fepvals, lambda,
1506                              enerd, flags, nrnb, wcycle);
1507         }
1508
1509         if (DOMAINDECOMP(cr) &&
1510             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1511         {
1512             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1513                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1514                              inputrec->fepvals, lambda,
1515                              enerd, flags, nrnb, wcycle);
1516         }
1517     }
1518
1519     if (!bUseOrEmulGPU)
1520     {
1521         int aloc;
1522
1523         if (DOMAINDECOMP(cr))
1524         {
1525             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1526                          step, nrnb, wcycle);
1527         }
1528
1529         if (!bUseOrEmulGPU)
1530         {
1531             aloc = eintLocal;
1532         }
1533         else
1534         {
1535             aloc = eintNonlocal;
1536         }
1537
1538         /* Add all the non-bonded forces to the normal force array.
1539          * This can be split into a local and a non-local part when overlapping
1540          * communication with computation under domain decomposition.
1541          */
1542         wallcycle_stop(wcycle, ewcFORCE);
1543
1544         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1545
1546         wallcycle_start_nocount(wcycle, ewcFORCE);
1547
1548         /* if there are multiple fshift output buffers reduce them */
1549         if ((flags & GMX_FORCE_VIRIAL) &&
1550             nbv->grp[aloc].nbl_lists.nnbl > 1)
1551         {
1552             /* This is not in a subcounter because it takes a
1553                negligible and constant-sized amount of time */
1554             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1555                                                      fr->fshift);
1556         }
1557     }
1558
1559     /* update QMMMrec, if necessary */
1560     if (fr->bQMMM)
1561     {
1562         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1563     }
1564
1565     /* Compute the bonded and non-bonded energies and optionally forces */
1566     do_force_lowlevel(fr, inputrec, &(top->idef),
1567                       cr, ms, nrnb, wcycle, mdatoms,
1568                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1569                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1570                       flags, &cycles_pme);
1571
1572     wallcycle_stop(wcycle, ewcFORCE);
1573
1574     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1575                          step, t, wcycle,
1576                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1577                          flags, &forceWithVirial, enerd,
1578                          ed, bNS);
1579
1580     if (bUseOrEmulGPU)
1581     {
1582         /* wait for non-local forces (or calculate in emulation mode) */
1583         if (DOMAINDECOMP(cr))
1584         {
1585             if (bUseGPU)
1586             {
1587                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1588                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1589                                            flags, eatNonlocal,
1590                                            ppForceWorkload->haveGpuBondedWork,
1591                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1592                                            fr->fshift);
1593                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1594             }
1595             else
1596             {
1597                 wallcycle_start_nocount(wcycle, ewcFORCE);
1598                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1599                              step, nrnb, wcycle);
1600                 wallcycle_stop(wcycle, ewcFORCE);
1601             }
1602
1603             /* skip the reduction if there was no non-local work to do */
1604             if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
1605             {
1606                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1607                                                nbv->nbat, f, wcycle);
1608             }
1609         }
1610     }
1611
1612     if (DOMAINDECOMP(cr))
1613     {
1614         /* We are done with the CPU compute.
1615          * We will now communicate the non-local forces.
1616          * If we use a GPU this will overlap with GPU work, so in that case
1617          * we do not close the DD force balancing region here.
1618          */
1619         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1620         {
1621             ddCloseBalanceRegionCpu(cr->dd);
1622         }
1623         if (bDoForces)
1624         {
1625             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1626         }
1627     }
1628
1629     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1630     // an alternating wait/reduction scheme.
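         // Note: this scheme polls the PME and nonbonded GPU tasks and reduces
         // the output of whichever completes first (see alternatePmeNbGpuWaitReduce),
         // instead of waiting on them in a fixed order.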
1631     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1632     if (alternateGpuWait)
1633     {
1634         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1635     }
1636
1637     if (!alternateGpuWait && useGpuPme)
1638     {
1639         pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1640     }
1641
1642     /* Wait for local GPU NB outputs on the non-alternating wait path */
1643     if (!alternateGpuWait && bUseGPU)
1644     {
1645         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1646          * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1647          * but even with a step of 0.1 ms the difference is less than 1%
1648          * of the step time.
1649          */
1650         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1651
1652         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1653         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1654                                    flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1655                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1656                                    fr->fshift);
1657         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1658
1659         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1660         {
1661             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1662             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1663             {
1664                 /* We measured only a few cycles, so the kernel and transfer
1665                  * may have finished earlier and there was no actual wait time,
1666                  * only API call overhead.
1667                  * The actual time could then be anywhere between 0 and
1668                  * cycles_wait_est; we will use half of cycles_wait_est.
1669                  */
1670                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1671             }
1672             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1673         }
1674     }
1675
1676     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1677     {
1678         // NOTE: emulation kernel is not included in the balancing region,
1679         // but emulation mode does not target performance anyway
1680         wallcycle_start_nocount(wcycle, ewcFORCE);
1681         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1682                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1683                      step, nrnb, wcycle);
1684         wallcycle_stop(wcycle, ewcFORCE);
1685     }
1686
1687     if (useGpuPme)
1688     {
1689         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1690     }
1691
1692     if (bUseGPU)
1693     {
1694         /* now clear the GPU outputs while we finish the step on the CPU */
1695         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1696         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1697         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1698
1699         /* Is dynamic pair-list pruning activated? */
1700         if (nbv->listParams->useDynamicPruning)
1701         {
1702             launchGpuRollingPruning(cr, nbv, inputrec, step);
1703         }
1704         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1705         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1706     }
1707
1708     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1709     {
1710         wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1711         // in principle this should be included in the DD balancing region,
1712         // but generally it is infrequent so we'll omit it for the sake of
1713         // simpler code
1714         fr->gpuBonded->accumulateEnergyTerms(enerd);
1715         wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1716
1717         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1718         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1719         fr->gpuBonded->clearEnergies();
1720         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1721         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1722     }
1723
1724     /* Do the nonbonded GPU (or emulation) force buffer reduction
1725      * on the non-alternating path. */
1726     if (bUseOrEmulGPU && !alternateGpuWait)
1727     {
1728         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1729                                        nbv->nbat, f, wcycle);
1730     }
1731     if (DOMAINDECOMP(cr))
1732     {
1733         dd_force_flop_stop(cr->dd, nrnb);
1734     }
1735
1736     if (bDoForces)
1737     {
1738         /* If we have NoVirSum forces, but we do not calculate the virial,
1739          * we sum fr->f_novirsum=f later.
1740          */
1741         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1742         {
1743             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1744                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1745         }
1746
1747         if (flags & GMX_FORCE_VIRIAL)
1748         {
1749             /* Calculation of the virial must be done after vsites! */
1750             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1751                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1752         }
1753     }
1754
1755     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1756     {
1757         /* In case of node-splitting, the PP nodes receive the long-range
1758          * forces, virial and energy from the PME nodes here.
1759          */
1760         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1761     }
1762
1763     if (bDoForces)
1764     {
1765         post_process_forces(cr, step, nrnb, wcycle,
1766                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1767                             vir_force, mdatoms, graph, fr, vsite,
1768                             flags);
1769     }
1770
1771     if (flags & GMX_FORCE_ENERGY)
1772     {
1773         /* Sum the potential energy terms from group contributions */
1774         sum_epot(&(enerd->grpp), enerd->term);
1775
1776         if (!EI_TPI(inputrec->eI))
1777         {
1778             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1779         }
1780     }
1781 }
1782
1783 static void do_force_cutsGROUP(FILE *fplog,
1784                                const t_commrec *cr,
1785                                const gmx_multisim_t *ms,
1786                                const t_inputrec *inputrec,
1787                                gmx::Awh *awh,
1788                                gmx_enfrot *enforcedRotation,
1789                                int64_t step,
1790                                t_nrnb *nrnb,
1791                                gmx_wallcycle_t wcycle,
1792                                gmx_localtop_t *top,
1793                                const gmx_groups_t *groups,
1794                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1795                                history_t *hist,
1796                                gmx::ArrayRefWithPadding<gmx::RVec> force,
1797                                tensor vir_force,
1798                                const t_mdatoms *mdatoms,
1799                                gmx_enerdata_t *enerd,
1800                                t_fcdata *fcd,
1801                                real *lambda,
1802                                t_graph *graph,
1803                                t_forcerec *fr,
1804                                const gmx_vsite_t *vsite,
1805                                rvec mu_tot,
1806                                double t,
1807                                gmx_edsam *ed,
1808                                int flags,
1809                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1810                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1811 {
1812     int        cg0, cg1, i, j;
1813     double     mu[2*DIM];
1814     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1815     gmx_bool   bDoForces;
1816     float      cycles_pme;
1817
1818     const int  start  = 0;
1819     const int  homenr = mdatoms->homenr;
1820
1821     clear_mat(vir_force);
1822
1823     cg0 = 0;
1824     if (DOMAINDECOMP(cr))
1825     {
1826         cg1 = cr->dd->globalAtomGroupIndices.size();
1827     }
1828     else
1829     {
1830         cg1 = top->cgs.nr;
1831     }
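         /* With test-particle insertion the molecule to be inserted is the last
          * charge group, which is excluded from the charge-group range here.
          */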
1832     if (fr->n_tpi > 0)
1833     {
1834         cg1--;
1835     }
1836
1837     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1838     bNS            = ((flags & GMX_FORCE_NS) != 0);
1839     /* Should we perform the long-range nonbonded evaluation inside the neighbor searching? */
1840     bFillGrid      = (bNS && bStateChanged);
1841     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1842     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1843
1844     if (bStateChanged)
1845     {
1846         update_forcerec(fr, box);
1847
1848         if (inputrecNeedMutot(inputrec))
1849         {
1850             /* Calculate total (local) dipole moment in a temporary common array.
1851              * This makes it possible to sum them over nodes faster.
1852              */
1853             calc_mu(start, homenr,
1854                     x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1855                     mu, mu+DIM);
1856         }
1857     }
1858
1859     if (fr->ePBC != epbcNONE)
1860     {
1861         /* Compute shift vectors every step,
1862          * because of pressure coupling or box deformation!
1863          */
1864         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1865         {
1866             calc_shifts(box, fr->shift_vec);
1867         }
1868
1869         if (bCalcCGCM)
1870         {
1871             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1872                                      &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1873             inc_nrnb(nrnb, eNR_CGCM, homenr);
1874             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1875         }
1876         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1877         {
1878             unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1879         }
1880     }
1881     else if (bCalcCGCM)
1882     {
1883         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1884         inc_nrnb(nrnb, eNR_CGCM, homenr);
1885     }
1886
1887     if (bCalcCGCM && gmx_debug_at)
1888     {
1889         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1890     }
1891
1892 #if GMX_MPI
1893     if (!thisRankHasDuty(cr, DUTY_PME))
1894     {
1895         /* Send particle coordinates to the PME nodes.
1896          * Since this is only implemented for domain decomposition
1897          * and domain decomposition does not use the graph,
1898          * we do not need to worry about shifting.
1899          */
1900         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1901                                  lambda[efptCOUL], lambda[efptVDW],
1902                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1903                                  step, wcycle);
1904     }
1905 #endif /* GMX_MPI */
1906
1907     /* Communicate coordinates and sum dipole if necessary */
1908     if (DOMAINDECOMP(cr))
1909     {
1910         dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1911
1912         /* No GPU support, no move_x overlap, so reopen the balance region here */
1913         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1914         {
1915             ddReopenBalanceRegionCpu(cr->dd);
1916         }
1917     }
1918
1919     if (inputrecNeedMutot(inputrec))
1920     {
1921         if (bStateChanged)
1922         {
1923             if (PAR(cr))
1924             {
1925                 gmx_sumd(2*DIM, mu, cr);
1926                 ddReopenBalanceRegionCpu(cr->dd);
1927             }
1928             for (i = 0; i < 2; i++)
1929             {
1930                 for (j = 0; j < DIM; j++)
1931                 {
1932                     fr->mu_tot[i][j] = mu[i*DIM + j];
1933                 }
1934             }
1935         }
1936         if (fr->efep == efepNO)
1937         {
1938             copy_rvec(fr->mu_tot[0], mu_tot);
1939         }
1940         else
1941         {
1942             for (j = 0; j < DIM; j++)
1943             {
1944                 mu_tot[j] =
1945                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1946             }
1947         }
1948     }
1949
1950     /* Reset energies */
1951     reset_enerdata(enerd);
1952     clear_rvecs(SHIFTS, fr->fshift);
1953
1954     if (bNS)
1955     {
1956         wallcycle_start(wcycle, ewcNS);
1957
1958         if (graph && bStateChanged)
1959         {
1960             /* Calculate intramolecular shift vectors to make molecules whole */
1961             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1962         }
1963
1964         /* Do the actual neighbour searching */
1965         ns(fplog, fr, box,
1966            groups, top, mdatoms,
1967            cr, nrnb, bFillGrid);
1968
1969         wallcycle_stop(wcycle, ewcNS);
1970     }
1971
1972     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1973     {
1974         wallcycle_start(wcycle, ewcPPDURINGPME);
1975         dd_force_flop_start(cr->dd, nrnb);
1976     }
1977
1978     if (inputrec->bRot)
1979     {
1980         wallcycle_start(wcycle, ewcROT);
1981         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1982         wallcycle_stop(wcycle, ewcROT);
1983     }
1984
1985     /* Temporary solution until all routines take PaddedRVecVector */
1986     rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1987
1988     /* Start the force cycle counter.
1989      * Note that a different counter is used for dynamic load balancing.
1990      */
1991     wallcycle_start(wcycle, ewcFORCE);
1992
1993     gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1994     if (bDoForces)
1995     {
1996         /* If we need to compute the virial, we might need a separate
1997          * force buffer for algorithms for which the virial is calculated
1998          * separately, such as PME.
1999          */
2000         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
2001         {
2002             forceRef = *fr->forceBufferForDirectVirialContributions;
2003
2004             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
2005         }
2006
2007         /* Clear the short- and long-range forces */
2008         clear_rvecs(fr->natoms_force_constr, f);
2009     }
2010
2011     /* forceWithVirial might need the full force atom range */
2012     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
2013
2014     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
2015     {
2016         clear_pull_forces(inputrec->pull_work);
2017     }
2018
2019     /* update QMMMrec, if necessary */
2020     if (fr->bQMMM)
2021     {
2022         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
2023     }
2024
2025     /* Compute the bonded and non-bonded energies and optionally forces */
2026     do_force_lowlevel(fr, inputrec, &(top->idef),
2027                       cr, ms, nrnb, wcycle, mdatoms,
2028                       as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
2029                       box, inputrec->fepvals, lambda,
2030                       graph, &(top->excls), fr->mu_tot,
2031                       flags,
2032                       &cycles_pme);
2033
2034     wallcycle_stop(wcycle, ewcFORCE);
2035
2036     if (DOMAINDECOMP(cr))
2037     {
2038         dd_force_flop_stop(cr->dd, nrnb);
2039
2040         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2041         {
2042             ddCloseBalanceRegionCpu(cr->dd);
2043         }
2044     }
2045
2046     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2047                          step, t, wcycle,
2048                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
2049                          flags, &forceWithVirial, enerd,
2050                          ed, bNS);
2051
2052     if (bDoForces)
2053     {
2054         /* Communicate the forces */
2055         if (DOMAINDECOMP(cr))
2056         {
2057             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
2058             /* Do we need to communicate the separate force array
2059              * for terms that do not contribute to the single sum virial?
2060              * Position restraints and electric fields do not introduce
2061              * inter-cg forces, only full electrostatics methods do.
2062              * When we do not calculate the virial, fr->f_novirsum = f,
2063              * so we have already communicated these forces.
2064              */
2065             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2066                 (flags & GMX_FORCE_VIRIAL))
2067             {
2068                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2069             }
2070         }
2071
2072         /* If we have NoVirSum forces, but we do not calculate the virial,
2073          * we sum fr->f_novirsum=f later.
2074          */
2075         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2076         {
2077             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
2078                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2079         }
2080
2081         if (flags & GMX_FORCE_VIRIAL)
2082         {
2083             /* Calculation of the virial must be done after vsites! */
2084             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
2085                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2086         }
2087     }
2088
2089     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2090     {
2091         /* In case of node-splitting, the PP nodes receive the long-range
2092          * forces, virial and energy from the PME nodes here.
2093          */
2094         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2095     }
2096
2097     if (bDoForces)
2098     {
2099         post_process_forces(cr, step, nrnb, wcycle,
2100                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2101                             vir_force, mdatoms, graph, fr, vsite,
2102                             flags);
2103     }
2104
2105     if (flags & GMX_FORCE_ENERGY)
2106     {
2107         /* Sum the potential energy terms from group contributions */
2108         sum_epot(&(enerd->grpp), enerd->term);
2109
2110         if (!EI_TPI(inputrec->eI))
2111         {
2112             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2113         }
2114     }
2115
2116 }
2117
2118 void do_force(FILE                                     *fplog,
2119               const t_commrec                          *cr,
2120               const gmx_multisim_t                     *ms,
2121               const t_inputrec                         *inputrec,
2122               gmx::Awh                                 *awh,
2123               gmx_enfrot                               *enforcedRotation,
2124               int64_t                                   step,
2125               t_nrnb                                   *nrnb,
2126               gmx_wallcycle_t                           wcycle,
2127               gmx_localtop_t                           *top,
2128               const gmx_groups_t                       *groups,
2129               matrix                                    box,
2130               gmx::ArrayRefWithPadding<gmx::RVec>       x,     //NOLINT(performance-unnecessary-value-param)
2131               history_t                                *hist,
2132               gmx::ArrayRefWithPadding<gmx::RVec>       force, //NOLINT(performance-unnecessary-value-param)
2133               tensor                                    vir_force,
2134               const t_mdatoms                          *mdatoms,
2135               gmx_enerdata_t                           *enerd,
2136               t_fcdata                                 *fcd,
2137               gmx::ArrayRef<real>                       lambda,
2138               t_graph                                  *graph,
2139               t_forcerec                               *fr,
2140               gmx::PpForceWorkload                     *ppForceWorkload,
2141               const gmx_vsite_t                        *vsite,
2142               rvec                                      mu_tot,
2143               double                                    t,
2144               gmx_edsam                                *ed,
2145               int                                       flags,
2146               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2147               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2148 {
2149     /* modify force flag if not doing nonbonded */
2150     if (!fr->bNonbonded)
2151     {
2152         flags &= ~GMX_FORCE_NONBONDED;
2153     }
2154
2155     switch (inputrec->cutoff_scheme)
2156     {
2157         case ecutsVERLET:
2158             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2159                                 awh, enforcedRotation, step, nrnb, wcycle,
2160                                 top,
2161                                 groups,
2162                                 box, x, hist,
2163                                 force, vir_force,
2164                                 mdatoms,
2165                                 enerd, fcd,
2166                                 lambda.data(), graph,
2167                                 fr,
2168                                 ppForceWorkload,
2169                                 fr->ic,
2170                                 vsite, mu_tot,
2171                                 t, ed,
2172                                 flags,
2173                                 ddOpenBalanceRegion,
2174                                 ddCloseBalanceRegion);
2175             break;
2176         case ecutsGROUP:
2177             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2178                                awh, enforcedRotation, step, nrnb, wcycle,
2179                                top,
2180                                groups,
2181                                box, x, hist,
2182                                force, vir_force,
2183                                mdatoms,
2184                                enerd, fcd,
2185                                lambda.data(), graph,
2186                                fr, vsite, mu_tot,
2187                                t, ed,
2188                                flags,
2189                                ddOpenBalanceRegion,
2190                                ddCloseBalanceRegion);
2191             break;
2192         default:
2193             gmx_incons("Invalid cut-off scheme passed!");
2194     }
2195
2196     /* In case we don't have constraints and are using GPUs, the next balancing
2197      * region starts here.
2198      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2199      * virial calculation and COM pulling, is thus not included in
2200      * the balance timing, which is ok as most of these tasks do communication.
2201      */
2202     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2203     {
2204         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2205     }
2206 }
2207
2208
2209 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2210                         const t_inputrec *ir, const t_mdatoms *md,
2211                         t_state *state)
2212 {
2213     int             i, m, start, end;
2214     int64_t         step;
2215     real            dt = ir->delta_t;
2216     real            dvdl_dum;
2217     rvec           *savex;
2218
2219     /* We need to allocate one element extra, since we might use
2220      * (unaligned) 4-wide SIMD loads to access rvec entries.
2221      */
2222     snew(savex, state->natoms + 1);
2223
2224     start = 0;
2225     end   = md->homenr;
2226
2227     if (debug)
2228     {
2229         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2230                 start, md->homenr, end);
2231     }
2232     /* Do a first constraint pass to reset the particles... */
2233     step = ir->init_step;
2234     if (fplog)
2235     {
2236         char buf[STEPSTRSIZE];
2237         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2238                 gmx_step_str(step, buf));
2239     }
2240     dvdl_dum = 0;
2241
2242     /* constrain the current position */
2243     constr->apply(TRUE, FALSE,
2244                   step, 0, 1.0,
2245                   state->x.rvec_array(), state->x.rvec_array(), nullptr,
2246                   state->box,
2247                   state->lambda[efptBONDED], &dvdl_dum,
2248                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2249     if (EI_VV(ir->eI))
2250     {
2251         /* constrain the initial velocity, and save it */
2252         /* this may also be useful if we need the ekin from the half step for velocity Verlet */
2253         constr->apply(TRUE, FALSE,
2254                       step, 0, 1.0,
2255                       state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2256                       state->box,
2257                       state->lambda[efptBONDED], &dvdl_dum,
2258                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2259     }
2260     /* constrain the initial velocities at t-dt/2 */
2261     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2262     {
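             /* For leap-frog style integrators we need constrained velocities at
              * the half step t-dt/2. Below we construct x(-dt) by stepping backwards
              * with the (temporarily reversed) velocities, constrain those positions
              * using x(0) as reference while letting the constraint corrections
              * update v, and finally restore the sign of the velocities.
              */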
2263         auto x = makeArrayRef(state->x).subArray(start, end);
2264         auto v = makeArrayRef(state->v).subArray(start, end);
2265         for (i = start; (i < end); i++)
2266         {
2267             for (m = 0; (m < DIM); m++)
2268             {
2269                 /* Reverse the velocity */
2270                 v[i][m] = -v[i][m];
2271                 /* Store the position at t-dt in buf */
2272                 savex[i][m] = x[i][m] + dt*v[i][m];
2273             }
2274         }
2275         /* Shake the positions at t=-dt with the positions at t=0
2276          * as reference coordinates.
2277          */
2278         if (fplog)
2279         {
2280             char buf[STEPSTRSIZE];
2281             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2282                     gmx_step_str(step, buf));
2283         }
2284         dvdl_dum = 0;
2285         constr->apply(TRUE, FALSE,
2286                       step, -1, 1.0,
2287                       state->x.rvec_array(), savex, nullptr,
2288                       state->box,
2289                       state->lambda[efptBONDED], &dvdl_dum,
2290                       state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2291
2292         for (i = start; i < end; i++)
2293         {
2294             for (m = 0; m < DIM; m++)
2295             {
2296                 /* Re-reverse the velocities */
2297                 v[i][m] = -v[i][m];
2298             }
2299         }
2300     }
2301     sfree(savex);
2302 }
2303
2304
2305 static void
2306 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2307                 double *enerout, double *virout)
2308 {
2309     double enersum, virsum;
2310     double invscale, invscale2, invscale3;
2311     double r, ea, eb, ec, pa, pb, pc, pd;
2312     double y0, f, g, h;
2313     int    ri, offset;
2314     double tabfactor;
2315
2316     invscale  = 1.0/scale;
2317     invscale2 = invscale*invscale;
2318     invscale3 = invscale*invscale2;
2319
2320     /* The following summation is derived from the cubic spline definition,
2321      * Numerical Recipes in C, second edition, p. 113-116.  It is exact for
2322      * the cubic spline.  We first calculate the negative of the
2323      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2324      * add the more standard, abrupt cutoff correction to that result,
2325      * yielding the long-range correction for a switched function.  We
2326      * perform both the pressure and energy loops at the same time for
2327      * simplicity, as the computational cost is low. */
2328
2329     if (offstart == 0)
2330     {
2331         /* Since the dispersion table has been scaled down by a factor of
2332          * 6.0 and the repulsion by a factor of 12.0 to compensate for the
2333          * c6/c12 parameters inside nbfp[] being scaled up (to save
2334          * flops in kernels), we need to correct for this.
2335          */
2336         tabfactor = 6.0;
2337     }
2338     else
2339     {
2340         tabfactor = 12.0;
2341     }
2342
2343     enersum = 0.0;
2344     virsum  = 0.0;
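         /* Within table bin ri the potential is the cubic spline
          * V(u) = y0 + f*u + g*u^2 + h*u^3, where u in [0,1] is the fractional
          * position in the bin and the radial distance is r' = r + u/scale
          * (r = ri/scale is the lower bin edge). The coefficients computed below
          * expand r'^2 dr' = (ec + eb*u + ea*u^2) du and
          * r'^3 = pd + pc*u + pb*u^2 + pa*u^3, so the enersum and virsum
          * increments are the exact per-bin integrals of r'^2*V (energy) and
          * r'^3*dV/dr' (virial), respectively.
          */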
2345     for (ri = rstart; ri < rend; ++ri)
2346     {
2347         r  = ri*invscale;
2348         ea = invscale3;
2349         eb = 2.0*invscale2*r;
2350         ec = invscale*r*r;
2351
2352         pa = invscale3;
2353         pb = 3.0*invscale2*r;
2354         pc = 3.0*invscale*r*r;
2355         pd = r*r*r;
2356
2357         /* this "8" is from the packing in the vdwtab array - perhaps
2358            should be defined? */
2359
2360         offset = 8*ri + offstart;
2361         y0     = vdwtab[offset];
2362         f      = vdwtab[offset+1];
2363         g      = vdwtab[offset+2];
2364         h      = vdwtab[offset+3];
2365
2366         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2367         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2368     }
2369     *enerout = 4.0*M_PI*enersum*tabfactor;
2370     *virout  = 4.0*M_PI*virsum*tabfactor;
2371 }
2372
2373 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2374 {
2375     double   eners[2], virs[2], enersum, virsum;
2376     double   r0, rc3, rc9;
2377     int      ri0, ri1, i;
2378     real     scale, *vdwtab;
2379
2380     fr->enershiftsix    = 0;
2381     fr->enershifttwelve = 0;
2382     fr->enerdiffsix     = 0;
2383     fr->enerdifftwelve  = 0;
2384     fr->virdiffsix      = 0;
2385     fr->virdifftwelve   = 0;
2386
2387     const interaction_const_t *ic = fr->ic;
2388
2389     if (eDispCorr != edispcNO)
2390     {
2391         for (i = 0; i < 2; i++)
2392         {
2393             eners[i] = 0;
2394             virs[i]  = 0;
2395         }
2396         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2397             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2398             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2399             (ic->vdwtype == evdwSHIFT) ||
2400             (ic->vdwtype == evdwSWITCH))
2401         {
2402             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2403                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2404                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2405             {
2406                 gmx_fatal(FARGS,
2407                           "With dispersion correction rvdw-switch can not be zero "
2408                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2409             }
2410
2411             /* TODO This code depends on the logic in tables.c that
2412                constructs the table layout, which should be made
2413                explicit in future cleanup. */
2414             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2415                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2416             scale  = fr->dispersionCorrectionTable->scale;
2417             vdwtab = fr->dispersionCorrectionTable->data;
2418
2419             /* Round the cut-offs to exact table values for precision */
2420             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2421             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2422
2423             /* The code below has some support for handling force-switching, i.e.
2424              * when the force (instead of potential) is switched over a limited
2425              * region. This leads to a constant shift in the potential inside the
2426              * switching region, which we can handle by adding a constant energy
2427              * term in the force-switch case just like when we do potential-shift.
2428              *
2429              * For now this is not enabled, but to keep the functionality in the
2430              * code we check separately for switch and shift. When we do force-switch
2431              * the shifting point is rvdw_switch, while it is the cutoff when we
2432              * have a classical potential-shift.
2433              *
2434              * For a pure potential-shift the potential has a constant shift
2435              * all the way out to the cutoff, and that is it. For other forms
2436              * we need to calculate the constant shift up to the point where we
2437              * start modifying the potential.
2438              */
2439             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2440
2441             r0   = ri0/scale;
2442             rc3  = r0*r0*r0;
2443             rc9  = rc3*rc3*rc3;
2444
2445             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2446                 (ic->vdwtype == evdwSHIFT))
2447             {
2448                 /* Determine the constant energy shift below rvdw_switch.
2449                  * Table has a scale factor since we have scaled it down to compensate
2450                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2451                  */
2452                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2453                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2454             }
2455             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2456             {
2457                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2458                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2459             }
2460
2461             /* Add the constant part from 0 to rvdw_switch.
2462              * This integration from 0 to rvdw_switch overcounts the number
2463              * of interactions by 1, as it also counts the self interaction.
2464              * We will correct for this later.
2465              */
2466             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2467             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2468
2469             /* Calculate the contribution in the range [r0,r1] where we
2470              * modify the potential. For a pure potential-shift modifier we will
2471              * have ri0==ri1, and there will not be any contribution here.
2472              */
2473             for (i = 0; i < 2; i++)
2474             {
2475                 enersum = 0;
2476                 virsum  = 0;
2477                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2478                 eners[i] -= enersum;
2479                 virs[i]  -= virsum;
2480             }
2481
2482             /* Alright: Above we compensated by REMOVING the parts outside r0
2483              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2484              *
2485              * Regardless of whether r0 is the point where we start switching,
2486              * or the cutoff where we calculated the constant shift, we include
2487              * all the parts we are missing out to infinity from r0 by
2488              * calculating the analytical dispersion correction.
2489              */
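                 /* Analytic tails from r0 to infinity:
                  *   energy: Int 4*pi*r^2*V(r) dr  gives -4*pi/(3*rc3) for V = -1/r^6
                  *           and 4*pi/(9*rc9) for V = 1/r^12;
                  *   virial: Int 4*pi*r^3*dV/dr dr gives 8*pi/rc3 and -16*pi/(3*rc9).
                  */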
2490             eners[0] += -4.0*M_PI/(3.0*rc3);
2491             eners[1] +=  4.0*M_PI/(9.0*rc9);
2492             virs[0]  +=  8.0*M_PI/rc3;
2493             virs[1]  += -16.0*M_PI/(3.0*rc9);
2494         }
2495         else if (ic->vdwtype == evdwCUT ||
2496                  EVDW_PME(ic->vdwtype) ||
2497                  ic->vdwtype == evdwUSER)
2498         {
2499             if (ic->vdwtype == evdwUSER && fplog)
2500             {
2501                 fprintf(fplog,
2502                         "WARNING: using dispersion correction with user tables\n");
2503             }
2504
2505             /* Note that with LJ-PME, the dispersion correction is multiplied
2506              * by the difference between the actual C6 and the value of C6
2507              * that would produce the combination rule.
2508              * This means the normal energy and virial difference formulas
2509              * can be used here.
2510              */
2511
2512             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2513             rc9  = rc3*rc3*rc3;
2514             /* Contribution beyond the cut-off */
2515             eners[0] += -4.0*M_PI/(3.0*rc3);
2516             eners[1] +=  4.0*M_PI/(9.0*rc9);
2517             if (ic->vdw_modifier == eintmodPOTSHIFT)
2518             {
2519                 /* Contribution within the cut-off */
2520                 eners[0] += -4.0*M_PI/(3.0*rc3);
2521                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2522             }
2523             /* Contribution beyond the cut-off */
2524             virs[0]  +=  8.0*M_PI/rc3;
2525             virs[1]  += -16.0*M_PI/(3.0*rc9);
2526         }
2527         else
2528         {
2529             gmx_fatal(FARGS,
2530                       "Dispersion correction is not implemented for vdw-type = %s",
2531                       evdw_names[ic->vdwtype]);
2532         }
2533
2534         /* When we deprecate the group kernels the code below can go too */
2535         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2536         {
2537             /* Calculate self-interaction coefficient (assuming that
2538              * the reciprocal-space contribution is constant in the
2539              * region that contributes to the self-interaction).
2540              */
2541             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2542
2543             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2544             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2545         }
2546
2547         fr->enerdiffsix    = eners[0];
2548         fr->enerdifftwelve = eners[1];
2549         /* The 0.5 is due to the GROMACS definition of the virial */
2550         fr->virdiffsix     = 0.5*virs[0];
2551         fr->virdifftwelve  = 0.5*virs[1];
2552     }
2553 }
2554
2555 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2556                    const matrix box, real lambda, tensor pres, tensor virial,
2557                    real *prescorr, real *enercorr, real *dvdlcorr)
2558 {
2559     gmx_bool bCorrAll, bCorrPres;
2560     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2561     int      m;
2562
2563     *prescorr = 0;
2564     *enercorr = 0;
2565     *dvdlcorr = 0;
2566
2567     clear_mat(virial);
2568     clear_mat(pres);
2569
2570     if (ir->eDispCorr != edispcNO)
2571     {
2572         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2573                      ir->eDispCorr == edispcAllEnerPres);
2574         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2575                      ir->eDispCorr == edispcAllEnerPres);
2576
2577         invvol = 1/det(box);
2578         if (fr->n_tpi)
2579         {
2580             /* Only correct for the interactions with the inserted molecule */
2581             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2582             ninter = fr->n_tpi;
2583         }
2584         else
2585         {
2586             dens   = fr->numAtomsForDispersionCorrection*invvol;
2587             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2588         }
2589
2590         if (ir->efep == efepNO)
2591         {
2592             avcsix    = fr->avcsix[0];
2593             avctwelve = fr->avctwelve[0];
2594         }
2595         else
2596         {
2597             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2598             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2599         }
2600
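             /* Standard tail correction: the missing energy is
              * <C6> * ninter * dens * enerdiffsix, with ninter (roughly N/2)
              * interacting pairs and dens = N/V; subtracting enershiftsix removes
              * the self-interaction over-counted when integrating the constant
              * shift from r = 0 (see calc_enervirdiff).
              */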
2601         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2602         *enercorr += avcsix*enerdiff;
2603         dvdlambda  = 0.0;
2604         if (ir->efep != efepNO)
2605         {
2606             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2607         }
2608         if (bCorrAll)
2609         {
2610             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2611             *enercorr += avctwelve*enerdiff;
2612             if (fr->efep != efepNO)
2613             {
2614                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2615             }
2616         }
2617
2618         if (bCorrPres)
2619         {
2620             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2621             if (ir->eDispCorr == edispcAllEnerPres)
2622             {
2623                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2624             }
2625             /* The factor 2 is because of the GROMACS virial definition */
2626             spres = -2.0*invvol*svir*PRESFAC;
2627
2628             for (m = 0; m < DIM; m++)
2629             {
2630                 virial[m][m] += svir;
2631                 pres[m][m]   += spres;
2632             }
2633             *prescorr += spres;
2634         }
2635
2636         /* Can't currently control when it prints; for now, just print when debugging */
2637         if (debug)
2638         {
2639             if (bCorrAll)
2640             {
2641                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2642                         avcsix, avctwelve);
2643             }
2644             if (bCorrPres)
2645             {
2646                 fprintf(debug,
2647                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2648                         *enercorr, spres, svir);
2649             }
2650             else
2651             {
2652                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2653             }
2654         }
2655
2656         if (fr->efep != efepNO)
2657         {
2658             *dvdlcorr += dvdlambda;
2659         }
2660     }
2661 }
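
/* For reference, the correction applied above written out (this restates the
 * code; it is not an independent formula). With dens and ninter as set above:
 *
 *   Ecorr  = ninter * <C6>  * (dens * enerdiffsix    - enershiftsix)
 *          + ninter * <C12> * (dens * enerdifftwelve - enershifttwelve)   (AllEner variants only)
 *   svir   = ninter * dens * <C6>  * virdiffsix    / 3
 *          + ninter * dens * <C12> * virdifftwelve / 3                    (AllEnerPres only)
 *   Pcorr  = -2 * svir * PRESFAC / V   (added to each diagonal element)
 *
 * with <C6> and <C12> the (possibly lambda-interpolated) average coefficients
 * and the enerdiff, enershift and virdiff terms precomputed in t_forcerec.
 */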
2662
2663 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2664                             const gmx_mtop_t *mtop, rvec x[],
2665                             gmx_bool bFirst)
2666 {
2667     t_graph        *graph;
2668     int             as, mol;
2669
2670     if (bFirst && fplog)
2671     {
2672         fprintf(fplog, "Removing pbc first time\n");
2673     }
2674
2675     snew(graph, 1);
2676     as = 0;
2677     for (const gmx_molblock_t &molb : mtop->molblock)
2678     {
2679         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2680         if (moltype.atoms.nr == 1 ||
2681             (!bFirst && moltype.cgs.nr == 1))
2682         {
2683             /* Just one atom or charge group in the molecule, no PBC required */
2684             as += molb.nmol*moltype.atoms.nr;
2685         }
2686         else
2687         {
2688             mk_graph_moltype(moltype, graph);
2689
2690             for (mol = 0; mol < molb.nmol; mol++)
2691             {
2692                 mk_mshift(fplog, graph, ePBC, box, x+as);
2693
2694                 shift_self(graph, box, x+as);
2695                 /* The molecule is whole now.
2696                  * We don't need the second mk_mshift call as in do_pbc_first,
2697                  * since we no longer need this graph.
2698                  */
2699
2700                 as += moltype.atoms.nr;
2701             }
2702             done_graph(graph);
2703         }
2704     }
2705     sfree(graph);
2706 }
2707
2708 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2709                        const gmx_mtop_t *mtop, rvec x[])
2710 {
2711     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2712 }
2713
2714 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2715                  const gmx_mtop_t *mtop, rvec x[])
2716 {
2717     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2718 }
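
/* Note on the two entry points above: with bFirst == TRUE, molecules that
 * consist of a single charge group (but more than one atom) are also made
 * whole; later calls skip them, presumably because a single charge group is
 * kept whole during the run. A hypothetical call sequence (the variable names
 * here are illustrative, not taken from this file):
 *
 *   do_pbc_first_mtop(fplog, ir->ePBC, state->box, &mtop, as_rvec_array(state->x.data()));
 *   // ... and for configurations produced later in the run:
 *   do_pbc_mtop(fplog, ir->ePBC, state->box, &mtop, as_rvec_array(state->x.data()));
 */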
2719
2720 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2721 {
2722     int t, nth;
2723     nth = gmx_omp_nthreads_get(emntDefault);
2724
2725 #pragma omp parallel for num_threads(nth) schedule(static)
2726     for (t = 0; t < nth; t++)
2727     {
2728         try
2729         {
2730             size_t natoms = x.size();
2731             size_t offset = (natoms*t    )/nth;
2732             size_t len    = (natoms*(t + 1))/nth - offset;
2733             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2734         }
2735         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2736     }
2737 }
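
/* A worked illustration of the static division above (example numbers only):
 * with x.size() == 10 and nth == 4 the per-thread ranges are [0,2), [2,5),
 * [5,7) and [7,10), i.e. chunk sizes 2,3,2,3, so every atom is handled exactly
 * once and the remainder is spread evenly over the threads.
 */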
2738
2739 // TODO This can be cleaned up a lot and moved back to runner.cpp
2740 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2741                 const t_inputrec *inputrec,
2742                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2743                 gmx_walltime_accounting_t walltime_accounting,
2744                 nonbonded_verlet_t *nbv,
2745                 const gmx_pme_t *pme,
2746                 gmx_bool bWriteStat)
2747 {
2748     t_nrnb *nrnb_tot = nullptr;
2749     double  delta_t  = 0;
2750     double  nbfs     = 0, mflop = 0;
2751     double  elapsed_time,
2752             elapsed_time_over_all_ranks,
2753             elapsed_time_over_all_threads,
2754             elapsed_time_over_all_threads_over_all_ranks;
2755     /* Control whether it is valid to print a report. Only the
2756        simulation master may print, but it should not do so if the run
2757        terminated, e.g., before a scheduled reset step. This is
2758        complicated by the fact that PME ranks are unaware of the
2759        reason why they were sent a pmerecvqxFINISH. To avoid
2760        communication deadlocks, we always do the communication for the
2761        report, even when we have decided not to write it, because the
2762        extra time this takes does not matter once we have decided not
2763        to report on the simulation performance.
2764
2765        Further, we only report performance for dynamical integrators,
2766        because those are the only ones for which we plan to do any
2767        optimizations. */
2768     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2769
2770     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2771     {
2772         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2773         printReport = false;
2774     }
2775
2776     if (cr->nnodes > 1)
2777     {
2778         snew(nrnb_tot, 1);
2779 #if GMX_MPI
2780         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2781                       cr->mpi_comm_mysim);
2782 #endif
2783     }
2784     else
2785     {
2786         nrnb_tot = nrnb;
2787     }
2788
2789     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2790     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2791     if (cr->nnodes > 1)
2792     {
2793 #if GMX_MPI
2794         /* reduce elapsed_time over all MPI ranks in the current simulation */
2795         MPI_Allreduce(&elapsed_time,
2796                       &elapsed_time_over_all_ranks,
2797                       1, MPI_DOUBLE, MPI_SUM,
2798                       cr->mpi_comm_mysim);
2799         elapsed_time_over_all_ranks /= cr->nnodes;
2800         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2801          * current simulation. */
2802         MPI_Allreduce(&elapsed_time_over_all_threads,
2803                       &elapsed_time_over_all_threads_over_all_ranks,
2804                       1, MPI_DOUBLE, MPI_SUM,
2805                       cr->mpi_comm_mysim);
2806 #endif
2807     }
2808     else
2809     {
2810         elapsed_time_over_all_ranks                  = elapsed_time;
2811         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2812     }
2813
2814     if (printReport)
2815     {
2816         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2817     }
2818     if (cr->nnodes > 1)
2819     {
2820         sfree(nrnb_tot);
2821     }
2822
2823     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2824     {
2825         print_dd_statistics(cr, inputrec, fplog);
2826     }
2827
2828     /* TODO Move the responsibility for any scaling by thread counts
2829      * to the code that handled the thread region, so that there's a
2830      * mechanism to keep cycle counting working during the transition
2831      * to task parallelism. */
2832     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2833     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
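    /* The boolean below is true only on dedicated PME ranks (PME duty but no
       PP duty), so the scaling can pick the PME thread count where that is
       the relevant one. */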
2834     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2835     auto cycle_sum(wallcycle_sum(cr, wcycle));
2836
2837     if (printReport)
2838     {
2839         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2840         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2841         if (pme_gpu_task_enabled(pme))
2842         {
2843             pme_gpu_get_timings(pme, &pme_gpu_timings);
2844         }
2845         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2846                         elapsed_time_over_all_ranks,
2847                         wcycle, cycle_sum,
2848                         nbnxn_gpu_timings,
2849                         &pme_gpu_timings);
2850
2851         if (EI_DYNAMICS(inputrec->eI))
2852         {
2853             delta_t = inputrec->delta_t;
2854         }
2855
2856         if (fplog)
2857         {
2858             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2859                        elapsed_time_over_all_ranks,
2860                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2861                        delta_t, nbfs, mflop);
2862         }
2863         if (bWriteStat)
2864         {
2865             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2866                        elapsed_time_over_all_ranks,
2867                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2868                        delta_t, nbfs, mflop);
2869         }
2870     }
2871 }
2872
2873 void initialize_lambdas(FILE               *fplog,
2874                         const t_inputrec   &ir,
2875                         bool                isMaster,
2876                         int                *fep_state,
2877                         gmx::ArrayRef<real> lambda,
2878                         double             *lam0)
2879 {
2880     /* TODO: Clean up initialization of fep_state and lambda in
2881        t_state.  This function works, but could probably use a logic
2882        rewrite to keep all the different types of efep straight. */
2883
2884     if ((ir.efep == efepNO) && (!ir.bSimTemp))
2885     {
2886         return;
2887     }
2888
2889     const t_lambda *fep = ir.fepvals;
2890     if (isMaster)
2891     {
2892         *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2893                                              the checkpoint, if one is used -- a kludge
2894                                              is in place for now to prevent this. */
2895     }
2896
2897     for (int i = 0; i < efptNR; i++)
2898     {
2899         double thisLambda;
2900         /* For now, overwrite the lambda state with init_lambda for backwards compatibility */
2901         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2902         {
2903             thisLambda = fep->init_lambda;
2904         }
2905         else
2906         {
2907             thisLambda = fep->all_lambda[i][fep->init_fep_state];
2908         }
2909         if (isMaster)
2910         {
2911             lambda[i] = thisLambda;
2912         }
2913         if (lam0 != nullptr)
2914         {
2915             lam0[i] = thisLambda;
2916         }
2917     }
2918     if (ir.bSimTemp)
2919     {
2920         /* need to rescale control temperatures to match current state */
2921         for (int i = 0; i < ir.opts.ngtc; i++)
2922         {
2923             if (ir.opts.ref_t[i] > 0)
2924             {
2925                 ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
2926             }
2927         }
2928     }
2929
2930     /* Send to the log the information on the current lambdas */
2931     if (fplog != nullptr)
2932     {
2933         fprintf(fplog, "Initial vector of lambda components: [ ");
2934         for (const auto &l : lambda)
2935         {
2936             fprintf(fplog, "%10.4f ", l);
2937         }
2938         fprintf(fplog, "]\n");
2939     }
2940 }
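
/* A hypothetical caller of initialize_lambdas() (illustration only; the
 * surrounding variable names are assumptions, not taken from this file):
 *
 *   int    fepState = 0;
 *   double lam0[efptNR];
 *   initialize_lambdas(fplog, *inputrec, MASTER(cr), &fepState,
 *                      state->lambda, lam0);
 *
 * Only the master rank gets fepState and state->lambda filled in; lam0 is
 * filled on any rank that passes a non-null pointer.
 */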