1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "sim_util.h"
40
41 #include "config.h"
42
43 #include <cmath>
44 #include <cstdint>
45 #include <cstdio>
46 #include <cstring>
47
48 #include <array>
49
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/calcvir.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/forceoutput.h"
88 #include "gromacs/mdtypes/iforceprovider.h"
89 #include "gromacs/mdtypes/inputrec.h"
90 #include "gromacs/mdtypes/md_enums.h"
91 #include "gromacs/mdtypes/state.h"
92 #include "gromacs/pbcutil/ishift.h"
93 #include "gromacs/pbcutil/mshift.h"
94 #include "gromacs/pbcutil/pbc.h"
95 #include "gromacs/pulling/pull.h"
96 #include "gromacs/pulling/pull_rotation.h"
97 #include "gromacs/timing/cyclecounter.h"
98 #include "gromacs/timing/gpu_timing.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/wallcyclereporting.h"
101 #include "gromacs/timing/walltime_accounting.h"
102 #include "gromacs/topology/topology.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
114
115 #include "nbnxn_gpu.h"
116 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
118
119 // TODO: this environment variable allows us to verify before release
120 // that on less common architectures the total cost of polling is not larger than
121 // a blocking wait (so polling does not introduce overhead when the static
122 // PME-first ordering would suffice).
123 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
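// Example (hypothetical command line, assuming a bash-like shell):
//   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun -deffnm prod
// makes mdrun fall back to the static wait order, so the cost of polling can be
// compared against a plain blocking wait.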
124
125
126 void print_time(FILE                     *out,
127                 gmx_walltime_accounting_t walltime_accounting,
128                 int64_t                   step,
129                 const t_inputrec         *ir,
130                 const t_commrec          *cr)
131 {
132     time_t finish;
133     char   timebuf[STRLEN];
134     double dt, elapsed_seconds, time_per_step;
135     char   buf[48];
136
137 #if !GMX_THREAD_MPI
138     if (!PAR(cr))
139 #endif
140     {
141         fprintf(out, "\r");
142     }
143     fprintf(out, "step %s", gmx_step_str(step, buf));
144     fflush(out);
145
146     if ((step >= ir->nstlist))
147     {
148         double seconds_since_epoch = gmx_gettime();
149         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
150         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
151         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
152
153         if (ir->nsteps >= 0)
154         {
155             if (dt >= 300)
156             {
157                 finish = static_cast<time_t>(seconds_since_epoch + dt);
158                 gmx_ctime_r(&finish, timebuf, STRLEN);
159                 sprintf(buf, "%s", timebuf);
160                 buf[strlen(buf)-1] = '\0';
161                 fprintf(out, ", will finish %s", buf);
162             }
163             else
164             {
165                 fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
166             }
167         }
168         else
169         {
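            /* ir->delta_t is in ps, so delta_t/1000 is ns per step; multiplying
             * by 24*60*60 seconds per day and dividing by the measured wall-clock
             * seconds per step gives the ns/day estimate printed here.
             */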
170             fprintf(out, " performance: %.1f ns/day    ",
171                     ir->delta_t/1000*24*60*60/time_per_step);
172         }
173     }
174 #if !GMX_THREAD_MPI
175     if (PAR(cr))
176     {
177         fprintf(out, "\n");
178     }
179 #else
180     GMX_UNUSED_VALUE(cr);
181 #endif
182
183     fflush(out);
184 }
185
186 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
187                          double the_time)
188 {
189     char   time_string[STRLEN];
190
191     if (!fplog)
192     {
193         return;
194     }
195
196     {
197         int    i;
198         char   timebuf[STRLEN];
199         time_t temp_time = static_cast<time_t>(the_time);
200
201         gmx_ctime_r(&temp_time, timebuf, STRLEN);
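        /* Copy only printable characters; this strips the trailing newline
         * that the ctime-style formatting appends to timebuf.
         */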
202         for (i = 0; timebuf[i] >= ' '; i++)
203         {
204             time_string[i] = timebuf[i];
205         }
206         time_string[i] = '\0';
207     }
208
209     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
210 }
211
212 void print_start(FILE *fplog, const t_commrec *cr,
213                  gmx_walltime_accounting_t walltime_accounting,
214                  const char *name)
215 {
216     char buf[STRLEN];
217
218     sprintf(buf, "Started %s", name);
219     print_date_and_time(fplog, cr->nodeid, buf,
220                         walltime_accounting_get_start_time_stamp(walltime_accounting));
221 }
222
223 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
224 {
225     const int      end = forceToAdd.size();
226
227     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
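    /* Reduce forceToAdd into f. The static schedule gives each thread a
     * contiguous block of atoms, so no atomics or explicit reductions are needed.
     */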
228 #pragma omp parallel for num_threads(nt) schedule(static)
229     for (int i = 0; i < end; i++)
230     {
231         rvec_inc(f[i], forceToAdd[i]);
232     }
233 }
234
235 static void pme_gpu_reduce_outputs(gmx_wallcycle_t                 wcycle,
236                                    gmx::ForceWithVirial           *forceWithVirial,
237                                    gmx::ArrayRef<const gmx::RVec>  pmeForces,
238                                    gmx_enerdata_t                 *enerd,
239                                    const tensor                    vir_Q,
240                                    real                            Vlr_q)
241 {
242     wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
243     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
244     forceWithVirial->addVirialContribution(vir_Q);
245     enerd->term[F_COUL_RECIP] += Vlr_q;
246     sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
247     wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
248 }
249
250 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
251                         tensor vir_part, const t_graph *graph, const matrix box,
252                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
253 {
254     /* The short-range virial from surrounding boxes */
255     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
256     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
257
258     /* Calculate partial virial, for local atoms only, based on short range.
259      * Total virial is computed in global_stat, called from do_md
260      */
261     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
262     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
263
264     if (debug)
265     {
266         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
267     }
268 }
269
270 static void pull_potential_wrapper(const t_commrec *cr,
271                                    const t_inputrec *ir,
272                                    const matrix box, gmx::ArrayRef<const gmx::RVec> x,
273                                    gmx::ForceWithVirial *force,
274                                    const t_mdatoms *mdatoms,
275                                    gmx_enerdata_t *enerd,
276                                    const real *lambda,
277                                    double t,
278                                    gmx_wallcycle_t wcycle)
279 {
280     t_pbc  pbc;
281     real   dvdl;
282
283     /* Calculate the center of mass forces; this requires communication,
284      * which is why pull_potential is called close to other communication.
285      */
286     wallcycle_start(wcycle, ewcPULLPOT);
287     set_pbc(&pbc, ir->ePBC, box);
288     dvdl                     = 0;
289     enerd->term[F_COM_PULL] +=
290         pull_potential(ir->pull_work, mdatoms, &pbc,
291                        cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
292     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
293     wallcycle_stop(wcycle, ewcPULLPOT);
294 }
295
296 static void pme_receive_force_ener(const t_commrec      *cr,
297                                    gmx::ForceWithVirial *forceWithVirial,
298                                    gmx_enerdata_t       *enerd,
299                                    gmx_wallcycle_t       wcycle)
300 {
301     real   e_q, e_lj, dvdl_q, dvdl_lj;
302     float  cycles_ppdpme, cycles_seppme;
303
304     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
305     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
306
307     /* In case of node-splitting, the PP nodes receive the long-range
308      * forces, virial and energy from the PME nodes here.
309      */
310     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
311     dvdl_q  = 0;
312     dvdl_lj = 0;
313     gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
314                       &cycles_seppme);
315     enerd->term[F_COUL_RECIP] += e_q;
316     enerd->term[F_LJ_RECIP]   += e_lj;
317     enerd->dvdl_lin[efptCOUL] += dvdl_q;
318     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
319
320     if (wcycle)
321     {
322         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
323     }
324     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
325 }
326
327 static void print_large_forces(FILE            *fp,
328                                const t_mdatoms *md,
329                                const t_commrec *cr,
330                                int64_t          step,
331                                real             forceTolerance,
332                                const rvec      *x,
333                                const rvec      *f)
334 {
335     real           force2Tolerance = gmx::square(forceTolerance);
336     gmx::index     numNonFinite    = 0;
337     for (int i = 0; i < md->homenr; i++)
338     {
339         real force2    = norm2(f[i]);
340         bool nonFinite = !std::isfinite(force2);
341         if (force2 >= force2Tolerance || nonFinite)
342         {
343             fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
344                     step,
345                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
346         }
347         if (nonFinite)
348         {
349             numNonFinite++;
350         }
351     }
352     if (numNonFinite > 0)
353     {
354         /* Note that with MPI this fatal call on one rank might interrupt
355          * the printing on other ranks. But we can only avoid that with
356          * an expensive MPI barrier that we would need at each step.
357          */
358         gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
359     }
360 }
361
362 static void post_process_forces(const t_commrec           *cr,
363                                 int64_t                    step,
364                                 t_nrnb                    *nrnb,
365                                 gmx_wallcycle_t            wcycle,
366                                 const gmx_localtop_t      *top,
367                                 const matrix               box,
368                                 const rvec                 x[],
369                                 rvec                       f[],
370                                 gmx::ForceWithVirial      *forceWithVirial,
371                                 tensor                     vir_force,
372                                 const t_mdatoms           *mdatoms,
373                                 const t_graph             *graph,
374                                 const t_forcerec          *fr,
375                                 const gmx_vsite_t         *vsite,
376                                 int                        flags)
377 {
378     if (fr->haveDirectVirialContributions)
379     {
380         rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
381
382         if (vsite)
383         {
384             /* Spread the mesh force on virtual sites to the other particles...
386              * This is parallelized. MPI communication is performed
386              * if the constructing atoms aren't local.
387              */
388             matrix virial = { { 0 } };
389             spread_vsite_f(vsite, x, fDirectVir, nullptr,
390                            (flags & GMX_FORCE_VIRIAL) != 0, virial,
391                            nrnb,
392                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
393             forceWithVirial->addVirialContribution(virial);
394         }
395
396         if (flags & GMX_FORCE_VIRIAL)
397         {
398             /* Now add the forces, this is local */
399             sum_forces(f, forceWithVirial->force_);
400
401             /* Add the direct virial contributions */
402             GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
403             m_add(vir_force, forceWithVirial->getVirial(), vir_force);
404
405             if (debug)
406             {
407                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
408             }
409         }
410     }
411
412     if (fr->print_force >= 0)
413     {
414         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
415     }
416 }
417
418 static void do_nb_verlet(const t_forcerec *fr,
419                          const interaction_const_t *ic,
420                          gmx_enerdata_t *enerd,
421                          int flags, int ilocality,
422                          int clearF,
423                          int64_t step,
424                          t_nrnb *nrnb,
425                          gmx_wallcycle_t wcycle)
426 {
427     if (!(flags & GMX_FORCE_NONBONDED))
428     {
429         /* skip non-bonded calculation */
430         return;
431     }
432
433     nonbonded_verlet_t       *nbv  = fr->nbv;
434     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
435
436     /* GPU kernel launch overhead is already timed separately */
437     if (fr->cutoff_scheme != ecutsVERLET)
438     {
439         gmx_incons("Invalid cut-off scheme passed!");
440     }
441
442     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
443
444     if (!bUsingGpuKernels)
445     {
446         /* When dynamic pair-list pruning is requested, we need to prune
447          * every nstlistPrune steps.
448          */
449         if (nbv->listParams->useDynamicPruning &&
450             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
451         {
452             /* Prune the pair-list beyond fr->ic->rlistPrune using
453              * the current coordinates of the atoms.
454              */
455             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
456             nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
457             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
458         }
459
460         wallcycle_sub_start(wcycle, ewcsNONBONDED);
461     }
462
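    /* Dispatch to the kernel flavor chosen at setup time: the plain-C and SIMD
     * CPU kernels run synchronously here, while the GPU case only launches the
     * kernel and returns.
     */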
463     switch (nbvg->kernel_type)
464     {
465         case nbnxnk4x4_PlainC:
466         case nbnxnk4xN_SIMD_4xN:
467         case nbnxnk4xN_SIMD_2xNN:
468             nbnxn_kernel_cpu(nbvg,
469                              nbv->nbat,
470                              ic,
471                              fr->shift_vec,
472                              flags,
473                              clearF,
474                              fr->fshift[0],
475                              enerd->grpp.ener[egCOULSR],
476                              fr->bBHAM ?
477                              enerd->grpp.ener[egBHAMSR] :
478                              enerd->grpp.ener[egLJSR]);
479             break;
480
481         case nbnxnk8x8x8_GPU:
482             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
483             break;
484
485         case nbnxnk8x8x8_PlainC:
486             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
487                                  nbv->nbat, ic,
488                                  fr->shift_vec,
489                                  flags,
490                                  clearF,
491                                  nbv->nbat->out[0].f,
492                                  fr->fshift[0],
493                                  enerd->grpp.ener[egCOULSR],
494                                  fr->bBHAM ?
495                                  enerd->grpp.ener[egBHAMSR] :
496                                  enerd->grpp.ener[egLJSR]);
497             break;
498
499         default:
500             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
501
502     }
503     if (!bUsingGpuKernels)
504     {
505         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
506     }
507
508     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
509     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
510     {
511         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
512     }
513     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
514              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
515     {
516         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
517     }
518     else
519     {
520         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
521     }
522     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
523     if (flags & GMX_FORCE_ENERGY)
524     {
525         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
526         enr_nbnxn_kernel_ljc += 1;
527         enr_nbnxn_kernel_lj  += 1;
528     }
529
530     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
531              nbvg->nbl_lists.natpair_ljq);
532     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
533              nbvg->nbl_lists.natpair_lj);
534     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
535     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
536              nbvg->nbl_lists.natpair_q);
537
538     if (ic->vdw_modifier == eintmodFORCESWITCH)
539     {
540         /* We add up the switch cost separately */
541         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
542                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
543     }
544     if (ic->vdw_modifier == eintmodPOTSWITCH)
545     {
546         /* We add up the switch cost separately */
547         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
548                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
549     }
550     if (ic->vdwtype == evdwPME)
551     {
552         /* We add up the LJ Ewald cost separately */
553         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
554                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
555     }
556 }
557
558 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
559                              t_forcerec           *fr,
560                              rvec                  x[],
561                              rvec                  f[],
562                              const t_mdatoms      *mdatoms,
563                              t_lambda             *fepvals,
564                              real                 *lambda,
565                              gmx_enerdata_t       *enerd,
566                              int                   flags,
567                              t_nrnb               *nrnb,
568                              gmx_wallcycle_t       wcycle)
569 {
570     int              donb_flags;
571     nb_kernel_data_t kernel_data;
572     real             lam_i[efptNR];
573     real             dvdl_nb[efptNR];
574     int              th;
575     int              i, j;
576
577     donb_flags = 0;
578     /* Add short-range interactions */
579     donb_flags |= GMX_NONBONDED_DO_SR;
580
581     /* Currently all group scheme kernels always calculate (shift-)forces */
582     if (flags & GMX_FORCE_FORCES)
583     {
584         donb_flags |= GMX_NONBONDED_DO_FORCE;
585     }
586     if (flags & GMX_FORCE_VIRIAL)
587     {
588         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
589     }
590     if (flags & GMX_FORCE_ENERGY)
591     {
592         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
593     }
594
595     kernel_data.flags  = donb_flags;
596     kernel_data.lambda = lambda;
597     kernel_data.dvdl   = dvdl_nb;
598
599     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
600     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
601
602     /* reset free energy components */
603     for (i = 0; i < efptNR; i++)
604     {
605         dvdl_nb[i]  = 0;
606     }
607
608     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
609
610     wallcycle_sub_start(wcycle, ewcsNONBONDED);
611 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
612     for (th = 0; th < nbl_lists->nnbl; th++)
613     {
614         try
615         {
616             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
617                                       x, f, fr, mdatoms, &kernel_data, nrnb);
618         }
619         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
620     }
621
622     if (fepvals->sc_alpha != 0)
623     {
624         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
625         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
626     }
627     else
628     {
629         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
630         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
631     }
632
633     /* If we do foreign lambda and we have soft-core interactions
634      * we have to recalculate the (non-linear) energy contributions.
635      */
636     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
637     {
638         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
639         kernel_data.lambda         = lam_i;
640         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
641         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
642         /* Note that we add to kernel_data.dvdl, but ignore the result */
643
644         for (i = 0; i < enerd->n_lambda; i++)
645         {
646             for (j = 0; j < efptNR; j++)
647             {
648                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
649             }
650             reset_foreign_enerdata(enerd);
651 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
652             for (th = 0; th < nbl_lists->nnbl; th++)
653             {
654                 try
655                 {
656                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
657                                               x, f, fr, mdatoms, &kernel_data, nrnb);
658                 }
659                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
660             }
661
662             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
663             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
664         }
665     }
666
667     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
668 }
669
670 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
671 {
672     return nbv != nullptr && nbv->bUseGPU;
673 }
674
675 static inline void clear_rvecs_omp(int n, rvec v[])
676 {
677     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
678
679     /* Note that we would like to avoid this conditional by putting it
680      * into the omp pragma instead, but then we still take the full
681      * omp parallel for overhead (at least with gcc5).
682      */
683     if (nth == 1)
684     {
685         for (int i = 0; i < n; i++)
686         {
687             clear_rvec(v[i]);
688         }
689     }
690     else
691     {
692 #pragma omp parallel for num_threads(nth) schedule(static)
693         for (int i = 0; i < n; i++)
694         {
695             clear_rvec(v[i]);
696         }
697     }
698 }
699
700 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
701  *
702  * \param groupOptions  Group options, containing T-coupling options
703  */
704 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
705 {
706     real nrdfCoupled   = 0;
707     real nrdfUncoupled = 0;
708     real kineticEnergy = 0;
709     for (int g = 0; g < groupOptions.ngtc; g++)
710     {
711         if (groupOptions.tau_t[g] >= 0)
712         {
713             nrdfCoupled   += groupOptions.nrdf[g];
714             kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
715         }
716         else
717         {
718             nrdfUncoupled += groupOptions.nrdf[g];
719         }
720     }
721
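    /* In formula form, the estimate returned below is
     *   <Ekin> ~ 0.5*kB * sum_{coupled g} nrdf_g*Tref_g * (nrdfCoupled + nrdfUncoupled)/nrdfCoupled,
     * i.e. the equipartition value of the T-coupled groups, scaled up to also
     * cover the degrees of freedom that are not temperature coupled.
     */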
722     /* This conditional with > also catches nrdf=0 */
723     if (nrdfCoupled > nrdfUncoupled)
724     {
725         return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
726     }
727     else
728     {
729         return 0;
730     }
731 }
732
733 /*! \brief This routine checks that the potential energy is finite.
734  *
735  * Always checks that the potential energy is finite. If step equals
736  * inputrec.init_step also checks that the magnitude of the potential energy
737  * is reasonable. Terminates with a fatal error when a check fails.
738  * Note that passing this check does not guarantee finite forces,
739      * since those use slightly different arithmetic. But in most cases
740  * there is just a narrow coordinate range where forces are not finite
741  * and energies are finite.
742  *
743  * \param[in] step      The step number, used for checking and printing
744  * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
745  * \param[in] inputrec  The input record
746  */
747 static void checkPotentialEnergyValidity(int64_t               step,
748                                          const gmx_enerdata_t &enerd,
749                                          const t_inputrec     &inputrec)
750 {
751     /* Threshold valid for comparing absolute potential energy against
752      * the kinetic energy. Normally one should not consider absolute
753      * potential energy values, but with a factor of one million
754      * we should never get false positives.
755      */
756     constexpr real c_thresholdFactor = 1e6;
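    /* Rough, illustrative scale: for ~10000 atoms at 300 K the average kinetic
     * energy is about 0.5*3N*kB*T ~ 4e4 kJ/mol, so this check only triggers for
     * potential energies above roughly 4e10 kJ/mol.
     */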
757
758     bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
759     real           averageKineticEnergy = 0;
760     /* We only check for large potential energy at the initial step,
761      * because that is by far the most likely step for this to occur
762      * and because computing the average kinetic energy is not free.
763      * Note: nstcalcenergy >> 1 often does not allow catching large energies
764      * before they become NaN.
765      */
766     if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
767     {
768         averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
769     }
770
771     if (energyIsNotFinite || (averageKineticEnergy > 0 &&
772                               enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
773     {
774         gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
775                   step,
776                   enerd.term[F_EPOT],
777                   energyIsNotFinite ? "not finite" : "extremely high",
778                   enerd.term[F_LJ],
779                   enerd.term[F_COUL_SR],
780                   energyIsNotFinite ? "non-finite" : "very high",
781                   energyIsNotFinite ? " or Nan" : "");
782     }
783 }
784
785 /*! \brief Compute forces and/or energies for special algorithms
786  *
787  * The intention is to collect all calls to algorithms that compute
788  * forces on local atoms only and that do not contribute to the local
789  * virial sum (but add their virial contribution separately).
790  * Eventually these should likely all become ForceProviders.
791  * Within this function the intention is to have algorithms that do
792  * global communication at the end, so global barriers within the MD loop
793  * are as close together as possible.
794  *
795  * \param[in]     fplog            The log file
796  * \param[in]     cr               The communication record
797  * \param[in]     inputrec         The input record
798  * \param[in]     awh              The Awh module (nullptr if none in use).
799  * \param[in]     enforcedRotation Enforced rotation module.
800  * \param[in]     step             The current MD step
801  * \param[in]     t                The current time
802  * \param[in,out] wcycle           Wallcycle accounting struct
803  * \param[in,out] forceProviders   Pointer to a list of force providers
804  * \param[in]     box              The unit cell
805  * \param[in]     x                The coordinates
806  * \param[in]     mdatoms          Per atom properties
807  * \param[in]     lambda           Array of free-energy lambda values
808  * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
809  * \param[in,out] forceWithVirial  Force and virial buffers
810  * \param[in,out] enerd            Energy buffer
811  * \param[in,out] ed               Essential dynamics pointer
812  * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
813  *
814  * \todo Remove bNS, which is used incorrectly.
815  * \todo Convert all other algorithms called here to ForceProviders.
816  */
817 static void
818 computeSpecialForces(FILE                          *fplog,
819                      const t_commrec               *cr,
820                      const t_inputrec              *inputrec,
821                      gmx::Awh                      *awh,
822                      gmx_enfrot                    *enforcedRotation,
823                      int64_t                        step,
824                      double                         t,
825                      gmx_wallcycle_t                wcycle,
826                      ForceProviders                *forceProviders,
827                      matrix                         box,
828                      gmx::ArrayRef<const gmx::RVec> x,
829                      const t_mdatoms               *mdatoms,
830                      real                          *lambda,
831                      int                            forceFlags,
832                      gmx::ForceWithVirial          *forceWithVirial,
833                      gmx_enerdata_t                *enerd,
834                      gmx_edsam                     *ed,
835                      gmx_bool                       bNS)
836 {
837     const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
838
839     /* NOTE: Currently all ForceProviders only provide forces.
840      *       When they also provide energies, remove this conditional.
841      */
842     if (computeForces)
843     {
844         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
845         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
846
847         /* Collect forces from modules */
848         forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
849     }
850
851     if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
852     {
853         pull_potential_wrapper(cr, inputrec, box, x,
854                                forceWithVirial,
855                                mdatoms, enerd, lambda, t,
856                                wcycle);
857
858         if (awh)
859         {
860             enerd->term[F_COM_PULL] +=
861                 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
862                                                   forceWithVirial,
863                                                   t, step, wcycle, fplog);
864         }
865     }
866
867     rvec *f = as_rvec_array(forceWithVirial->force_.data());
868
869     /* Add the forces from enforced rotation potentials (if any) */
870     if (inputrec->bRot)
871     {
872         wallcycle_start(wcycle, ewcROTadd);
873         enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
874         wallcycle_stop(wcycle, ewcROTadd);
875     }
876
877     if (ed)
878     {
879         /* Note that since init_edsam() is called after the initialization
880          * of forcerec, edsam doesn't request the noVirSum force buffer.
881          * Thus if no other algorithm (e.g. PME) requires it, the forces
882          * here will contribute to the virial.
883          */
884         do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
885     }
886
887     /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
888     if (inputrec->bIMD && computeForces)
889     {
890         IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
891     }
892 }
893
894 /*! \brief Launch the prepare_step and spread stages of PME GPU.
895  *
896  * \param[in]  pmedata       The PME structure
897  * \param[in]  box           The box matrix
898  * \param[in]  x             Coordinate array
899  * \param[in]  flags         Force flags
900  * \param[in]  wcycle        The wallcycle structure
901  */
902 static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
903                                       matrix          box,
904                                       rvec            x[],
905                                       int             flags,
906                                       gmx_wallcycle_t wcycle)
907 {
908     int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
909     pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
910     pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
911
912     pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
913     pme_gpu_launch_spread(pmedata, x, wcycle);
914 }
915
916 /*! \brief Launch the FFT and gather stages of PME GPU
917  *
918  * This function only implements setting the output forces (no accumulation).
919  *
920  * \param[in]  pmedata        The PME structure
921  * \param[in]  wcycle         The wallcycle structure
922  */
923 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
924                                      gmx_wallcycle_t   wcycle)
925 {
926     pme_gpu_launch_complex_transforms(pmedata, wcycle);
927     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
928 }
929
930 /*! \brief
931  *  Polling wait for either of the PME or nonbonded GPU tasks.
932  *
933  * Instead of a static order in waiting for GPU tasks, this function
934  * polls checking which of the two tasks completes first, and does the
935  * associated force buffer reduction overlapped with the other task.
936  * By doing that, unlike static scheduling order, it can always overlap
937  * one of the reductions, regardless of the GPU task completion order.
938  *
939  * \param[in]     nbv              Nonbonded verlet structure
940  * \param[in]     pmedata          PME module data
941  * \param[in,out] force            Force array to reduce task outputs into.
942  * \param[in,out] forceWithVirial  Force and virial buffers
943  * \param[in,out] fshift           Shift force output vector results are reduced into
944  * \param[in,out] enerd            Energy data structure results are reduced into
945  * \param[in]     flags            Force flags
946  * \param[in]     wcycle           The wallcycle structure
947  */
948 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
949                                         const gmx_pme_t                     *pmedata,
950                                         gmx::PaddedArrayRef<gmx::RVec>      *force,
951                                         gmx::ForceWithVirial                *forceWithVirial,
952                                         rvec                                 fshift[],
953                                         gmx_enerdata_t                      *enerd,
954                                         int                                  flags,
955                                         gmx_wallcycle_t                      wcycle)
956 {
957     bool isPmeGpuDone = false;
958     bool isNbGpuDone  = false;
959
960
961     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
962
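    /* Poll both GPU tasks; whichever finishes first has its outputs reduced on
     * the CPU while the other task is still running, so one reduction always
     * overlaps with GPU work regardless of completion order.
     */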
963     while (!isPmeGpuDone || !isNbGpuDone)
964     {
965         if (!isPmeGpuDone)
966         {
967             matrix            vir_Q;
968             real              Vlr_q;
969
970             GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
971             isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
972                                                    vir_Q, &Vlr_q, completionType);
973
974             if (isPmeGpuDone)
975             {
976                 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
977                                        enerd, vir_Q, Vlr_q);
978             }
979         }
980
981         if (!isNbGpuDone)
982         {
983             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
984             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
985             isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
986                                                     flags, eatLocal,
987                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
988                                                     fshift, completionType);
989             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
990             // To get the call count right, when the task finished we
991             // issue a start/stop.
992             // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
993             // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
994             if (isNbGpuDone)
995             {
996                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
997                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
998
999                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1000                                                nbv->nbat, as_rvec_array(force->data()), wcycle);
1001             }
1002         }
1003     }
1004 }
1005
1006 /*! \brief
1007  *  Launch the dynamic rolling pruning GPU task.
1008  *
1009  *  We currently alternate local/non-local list pruning in odd-even steps
1010  *  (only pruning every second step without DD).
1011  *
1012  * \param[in]     cr               The communication record
1013  * \param[in]     nbv              Nonbonded verlet structure
1014  * \param[in]     inputrec         The input record
1015  * \param[in]     step             The current MD step
1016  */
1017 static inline void launchGpuRollingPruning(const t_commrec          *cr,
1018                                            const nonbonded_verlet_t *nbv,
1019                                            const t_inputrec         *inputrec,
1020                                            const int64_t             step)
1021 {
1022     /* We should not launch the rolling pruning kernel at a search
1023      * step or just before search steps, since that's useless.
1024      * Without domain decomposition we prune at even steps.
1025      * With domain decomposition we alternate local and non-local
1026      * pruning at even and odd steps.
1027      */
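    /* Illustrative example: without DD and nstlist = 10, rolling pruning of the
     * local list is launched at steps 2, 4, 6 and 8 after list creation; with DD
     * the odd steps 1, 3, 5 and 7 additionally prune the non-local list.
     */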
1028     int  numRollingParts     = nbv->listParams->numRollingParts;
1029     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1030     int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1031     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
1032     if (stepWithCurrentList > 0 &&
1033         stepWithCurrentList < inputrec->nstlist - 1 &&
1034         (stepIsEven || DOMAINDECOMP(cr)))
1035     {
1036         nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1037                                           stepIsEven ? eintLocal : eintNonlocal,
1038                                           numRollingParts);
1039     }
1040 }
1041
1042 static void do_force_cutsVERLET(FILE *fplog,
1043                                 const t_commrec *cr,
1044                                 const gmx_multisim_t *ms,
1045                                 const t_inputrec *inputrec,
1046                                 gmx::Awh *awh,
1047                                 gmx_enfrot *enforcedRotation,
1048                                 int64_t step,
1049                                 t_nrnb *nrnb,
1050                                 gmx_wallcycle_t wcycle,
1051                                 const gmx_localtop_t *top,
1052                                 const gmx_groups_t * /* groups */,
1053                                 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1054                                 history_t *hist,
1055                                 gmx::PaddedArrayRef<gmx::RVec> force,
1056                                 tensor vir_force,
1057                                 const t_mdatoms *mdatoms,
1058                                 gmx_enerdata_t *enerd, t_fcdata *fcd,
1059                                 real *lambda,
1060                                 t_graph *graph,
1061                                 t_forcerec *fr,
1062                                 interaction_const_t *ic,
1063                                 const gmx_vsite_t *vsite,
1064                                 rvec mu_tot,
1065                                 double t,
1066                                 gmx_edsam *ed,
1067                                 int flags,
1068                                 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1069                                 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1070 {
1071     int                 cg1, i, j;
1072     double              mu[2*DIM];
1073     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
1074     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
1075     rvec                vzero, box_diag;
1076     float               cycles_pme, cycles_wait_gpu;
1077     nonbonded_verlet_t *nbv = fr->nbv;
1078
1079     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1080     bNS           = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1081     bFillGrid     = (bNS && bStateChanged);
1082     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
1083     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
1084     bUseGPU       = fr->nbv->bUseGPU;
1085     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1086
1087     const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1088     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1089     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1090         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
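    // With PmeRunMode::Mixed only spread and gather run on the GPU while the FFT
    // and solve parts of PME stay on the CPU; both modes take the GPU launch path
    // used below.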
1091
1092     /* At a search step we need to start the first balancing region
1093      * somewhere early inside the step after communication during domain
1094      * decomposition (and not during the previous step as usual).
1095      */
1096     if (bNS &&
1097         ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1098     {
1099         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1100     }
1101
1102     cycles_wait_gpu = 0;
1103
1104     const int start  = 0;
1105     const int homenr = mdatoms->homenr;
1106
1107     clear_mat(vir_force);
1108
1109     if (DOMAINDECOMP(cr))
1110     {
1111         cg1 = cr->dd->globalAtomGroupIndices.size();
1112     }
1113     else
1114     {
1115         cg1 = top->cgs.nr;
1116     }
1117     if (fr->n_tpi > 0)
1118     {
1119         cg1--;
1120     }
1121
1122     if (bStateChanged)
1123     {
1124         update_forcerec(fr, box);
1125
1126         if (inputrecNeedMutot(inputrec))
1127         {
1128             /* Calculate total (local) dipole moment in a temporary common array.
1129              * This makes it possible to sum them over nodes faster.
1130              */
1131             calc_mu(start, homenr,
1132                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1133                     mu, mu+DIM);
1134         }
1135     }
1136
1137     if (fr->ePBC != epbcNONE)
1138     {
1139         /* Compute shift vectors every step,
1140          * because of pressure coupling or box deformation!
1141          */
1142         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1143         {
1144             calc_shifts(box, fr->shift_vec);
1145         }
1146
1147         if (bCalcCGCM)
1148         {
1149             put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1150             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1151         }
1152         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1153         {
1154             unshift_self(graph, box, as_rvec_array(x.data()));
1155         }
1156     }
1157
1158     nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1159                                  fr->shift_vec, nbv->nbat);
1160
1161 #if GMX_MPI
1162     if (!thisRankHasDuty(cr, DUTY_PME))
1163     {
1164         /* Send particle coordinates to the pme nodes.
1165          * Since this is only implemented for domain decomposition
1166          * and domain decomposition does not use the graph,
1167          * we do not need to worry about shifting.
1168          */
1169         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1170                                  lambda[efptCOUL], lambda[efptVDW],
1171                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1172                                  step, wcycle);
1173     }
1174 #endif /* GMX_MPI */
1175
1176     if (useGpuPme)
1177     {
1178         launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1179     }
1180
1181     /* do gridding for pair search */
1182     if (bNS)
1183     {
1184         if (graph && bStateChanged)
1185         {
1186             /* Calculate intramolecular shift vectors to make molecules whole */
1187             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1188         }
1189
1190         clear_rvec(vzero);
1191         box_diag[XX] = box[XX][XX];
1192         box_diag[YY] = box[YY][YY];
1193         box_diag[ZZ] = box[ZZ][ZZ];
1194
1195         wallcycle_start(wcycle, ewcNS);
1196         if (!DOMAINDECOMP(cr))
1197         {
1198             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1199             nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1200                               0, vzero, box_diag,
1201                               0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1202                               0, nullptr,
1203                               nbv->grp[eintLocal].kernel_type,
1204                               nbv->nbat);
1205             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1206         }
1207         else
1208         {
1209             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1210             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1211                                        fr->cginfo, as_rvec_array(x.data()),
1212                                        nbv->grp[eintNonlocal].kernel_type,
1213                                        nbv->nbat);
1214             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1215         }
1216
1217         nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1218         wallcycle_stop(wcycle, ewcNS);
1219     }
1220
1221     /* initialize the GPU atom data and copy shift vector */
1222     if (bUseGPU)
1223     {
1224         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1225         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1226
1227         if (bNS)
1228         {
1229             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1230         }
1231
1232         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1233
1234         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1235         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1236     }
1237
1238     /* do local pair search */
1239     if (bNS)
1240     {
1241         wallcycle_start_nocount(wcycle, ewcNS);
1242         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1243         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1244                             &top->excls,
1245                             nbv->listParams->rlistOuter,
1246                             nbv->min_ci_balanced,
1247                             &nbv->grp[eintLocal].nbl_lists,
1248                             eintLocal,
1249                             nbv->grp[eintLocal].kernel_type,
1250                             nrnb);
1251         nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1252         if (nbv->listParams->useDynamicPruning && !bUseGPU)
1253         {
1254             nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1255         }
1256         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1257
1258         if (bUseGPU)
1259         {
1260             /* initialize local pair-list on the GPU */
1261             nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1262                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
1263                                     eintLocal);
1264         }
1265         wallcycle_stop(wcycle, ewcNS);
1266     }
1267     else
1268     {
1269         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1270                                         nbv->nbat, wcycle);
1271     }
1272
1273     if (bUseGPU)
1274     {
1275         if (DOMAINDECOMP(cr))
1276         {
1277             ddOpenBalanceRegionGpu(cr->dd);
1278         }
1279
1280         wallcycle_start(wcycle, ewcLAUNCH_GPU);
1281         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1282         /* launch local nonbonded work on GPU */
1283         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1284                      step, nrnb, wcycle);
1285         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1286         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1287     }
1288
1289     if (useGpuPme)
1290     {
1291         // In PME GPU and mixed mode we launch FFT / gather after the
1292         // X copy/transform to allow overlap as well as after the GPU NB
1293         // launch to avoid FFT launch overhead hijacking the CPU and delaying
1294         // the nonbonded kernel.
1295         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1296     }
1297
1298     /* Communicate coordinates and sum dipole if necessary +
1299        do non-local pair search */
1300     if (DOMAINDECOMP(cr))
1301     {
1302         if (bNS)
1303         {
1304             wallcycle_start_nocount(wcycle, ewcNS);
1305             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1306
1307             nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1308                                 &top->excls,
1309                                 nbv->listParams->rlistOuter,
1310                                 nbv->min_ci_balanced,
1311                                 &nbv->grp[eintNonlocal].nbl_lists,
1312                                 eintNonlocal,
1313                                 nbv->grp[eintNonlocal].kernel_type,
1314                                 nrnb);
1315             nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1316             if (nbv->listParams->useDynamicPruning && !bUseGPU)
1317             {
1318                 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1319             }
1320             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1321
1322             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1323             {
1324                 /* initialize non-local pair-list on the GPU */
1325                 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1326                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1327                                         eintNonlocal);
1328             }
1329             wallcycle_stop(wcycle, ewcNS);
1330         }
1331         else
1332         {
1333             dd_move_x(cr->dd, box, x, wcycle);
1334
1335             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1336                                             nbv->nbat, wcycle);
1337         }
1338
1339         if (bUseGPU)
1340         {
1341             wallcycle_start(wcycle, ewcLAUNCH_GPU);
1342             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1343             /* launch non-local nonbonded tasks on GPU */
1344             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1345                          step, nrnb, wcycle);
1346             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1347             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1348         }
1349     }
1350
1351     if (bUseGPU)
1352     {
1353         /* launch D2H copy-back F */
1354         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1355         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1356         if (DOMAINDECOMP(cr))
1357         {
1358             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1359                                      flags, eatNonlocal);
1360         }
1361         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1362                                  flags, eatLocal);
1363         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1364         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1365     }
1366
1367     if (bStateChanged && inputrecNeedMutot(inputrec))
1368     {
1369         if (PAR(cr))
1370         {
1371             gmx_sumd(2*DIM, mu, cr);
1372             ddReopenBalanceRegionCpu(cr->dd);
1373         }
1374
1375         for (i = 0; i < 2; i++)
1376         {
1377             for (j = 0; j < DIM; j++)
1378             {
1379                 fr->mu_tot[i][j] = mu[i*DIM + j];
1380             }
1381         }
1382     }
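    /* The total dipole used below is that of state A, or, with free-energy
     * perturbation, the linear interpolation in the coulomb lambda:
     *   mu_tot = (1 - lambda_coul)*mu_tot[A] + lambda_coul*mu_tot[B]
     */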
1383     if (fr->efep == efepNO)
1384     {
1385         copy_rvec(fr->mu_tot[0], mu_tot);
1386     }
1387     else
1388     {
1389         for (j = 0; j < DIM; j++)
1390         {
1391             mu_tot[j] =
1392                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1393                 lambda[efptCOUL]*fr->mu_tot[1][j];
1394         }
1395     }
1396
1397     /* Reset energies */
1398     reset_enerdata(enerd);
1399     clear_rvecs(SHIFTS, fr->fshift);
1400
1401     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1402     {
1403         wallcycle_start(wcycle, ewcPPDURINGPME);
1404         dd_force_flop_start(cr->dd, nrnb);
1405     }
1406
1407     if (inputrec->bRot)
1408     {
1409         wallcycle_start(wcycle, ewcROT);
1410         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1411         wallcycle_stop(wcycle, ewcROT);
1412     }
1413
1414     /* Temporary solution until all routines take PaddedRVecVector */
1415     rvec *const f = as_rvec_array(force.data());
1416
1417     /* Start the force cycle counter.
1418      * Note that a different counter is used for dynamic load balancing.
1419      */
1420     wallcycle_start(wcycle, ewcFORCE);
1421
1422     gmx::ArrayRef<gmx::RVec> forceRef = force;
1423     if (bDoForces)
1424     {
1425         /* If we need to compute the virial, we might need a separate
1426          * force buffer for algorithms for which the virial is calculated
1427          * directly, such as PME.
1428          */
1429         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1430         {
1431             forceRef = *fr->forceBufferForDirectVirialContributions;
1432
1433             /* TODO: update comment
1434              * We only compute forces on local atoms. Note that vsites can
1435              * spread to non-local atoms, but that part of the buffer is
1436              * cleared separately in the vsite spreading code.
1437              */
1438             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1439         }
1440         /* Clear the short- and long-range forces */
1441         clear_rvecs_omp(fr->natoms_force_constr, f);
1442     }
1443
1444     /* forceWithVirial uses the local atom range only */
1445     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1446
1447     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1448     {
1449         clear_pull_forces(inputrec->pull_work);
1450     }
1451
1452     /* We calculate the non-bonded forces, when done on the CPU, here.
1453      * We do this before calling do_force_lowlevel, because in that
1454      * function, the listed forces are calculated before PME, which
1455      * does communication.  With this order, non-bonded and listed
1456      * force calculation imbalance can be balanced out by the domain
1457      * decomposition load balancing.
1458      */
1459
1460     if (!bUseOrEmulGPU)
1461     {
1462         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1463                      step, nrnb, wcycle);
1464     }
1465
1466     if (fr->efep != efepNO)
1467     {
1468         /* Calculate the local and non-local free energy interactions here.
1469          * Happens here on the CPU both with and without GPU.
1470          */
1471         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1472         {
1473             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1474                              fr, as_rvec_array(x.data()), f, mdatoms,
1475                              inputrec->fepvals, lambda,
1476                              enerd, flags, nrnb, wcycle);
1477         }
1478
1479         if (DOMAINDECOMP(cr) &&
1480             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1481         {
1482             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1483                              fr, as_rvec_array(x.data()), f, mdatoms,
1484                              inputrec->fepvals, lambda,
1485                              enerd, flags, nrnb, wcycle);
1486         }
1487     }
1488
1489     if (!bUseOrEmulGPU)
1490     {
1491         int aloc;
1492
1493         if (DOMAINDECOMP(cr))
1494         {
1495             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1496                          step, nrnb, wcycle);
1497         }
1498
1499         if (!bUseOrEmulGPU)
1500         {
1501             aloc = eintLocal;
1502         }
1503         else
1504         {
1505             aloc = eintNonlocal;
1506         }
1507
1508         /* Add all the non-bonded forces to the normal force array.
1509          * This can be split into a local and a non-local part when overlapping
1510          * communication with computation under domain decomposition.
1511          */
1512         wallcycle_stop(wcycle, ewcFORCE);
1513
1514         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1515
1516         wallcycle_start_nocount(wcycle, ewcFORCE);
1517
1518         /* if there are multiple fshift output buffers, reduce them */
1519         if ((flags & GMX_FORCE_VIRIAL) &&
1520             nbv->grp[aloc].nbl_lists.nnbl > 1)
1521         {
1522             /* This is not in a subcounter because it takes a
1523                negligible and constant-sized amount of time */
1524             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1525                                                      fr->fshift);
1526         }
1527     }
1528
1529     /* update QMMMrec, if necessary */
1530     if (fr->bQMMM)
1531     {
1532         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1533     }
1534
1535     /* Compute the bonded and non-bonded energies and optionally forces */
1536     do_force_lowlevel(fr, inputrec, &(top->idef),
1537                       cr, ms, nrnb, wcycle, mdatoms,
1538                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1539                       box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1540                       flags, &cycles_pme);
1541
1542     wallcycle_stop(wcycle, ewcFORCE);
1543
1544     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1545                          step, t, wcycle,
1546                          fr->forceProviders, box, x, mdatoms, lambda,
1547                          flags, &forceWithVirial, enerd,
1548                          ed, bNS);
1549
1550     if (bUseOrEmulGPU)
1551     {
1552         /* wait for non-local forces (or calculate in emulation mode) */
1553         if (DOMAINDECOMP(cr))
1554         {
1555             if (bUseGPU)
1556             {
1557                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1558                 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1559                                            flags, eatNonlocal,
1560                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1561                                            fr->fshift);
1562                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1563             }
1564             else
1565             {
1566                 wallcycle_start_nocount(wcycle, ewcFORCE);
1567                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1568                              step, nrnb, wcycle);
1569                 wallcycle_stop(wcycle, ewcFORCE);
1570             }
1571
1572             /* skip the reduction if there was no non-local work to do */
1573             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1574             {
1575                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1576                                                nbv->nbat, f, wcycle);
1577             }
1578         }
1579     }
1580
1581     if (DOMAINDECOMP(cr))
1582     {
1583         /* We are done with the CPU compute.
1584          * We will now communicate the non-local forces.
1585          * If we use a GPU this will overlap with GPU work, so in that case
1586          * we do not close the DD force balancing region here.
1587          */
1588         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1589         {
1590             ddCloseBalanceRegionCpu(cr->dd);
1591         }
1592         if (bDoForces)
1593         {
1594             dd_move_f(cr->dd, force, fr->fshift, wcycle);
1595         }
1596     }
1597
1598     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
1599     // an alternating wait/reduction scheme.
1600     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1601     if (alternateGpuWait)
1602     {
1603         alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1604     }
1605
1606     if (!alternateGpuWait && useGpuPme)
1607     {
1608         gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1609         matrix vir_Q;
1610         real   Vlr_q = 0.0;
1611         pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1612         pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1613     }
1614
1615     /* Wait for local GPU NB outputs on the non-alternating wait path */
1616     if (!alternateGpuWait && bUseGPU)
1617     {
1618         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1619          * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
1620          * but even with a step of 0.1 ms the difference is less than 1%
1621          * of the step time.
1622          */
1623         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1624
1625         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1626         nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1627                                    flags, eatLocal,
1628                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1629                                    fr->fshift);
1630         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1631
1632         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1633         {
1634             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1635             if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1636             {
1637                 /* We measured only a few cycles, so it could be that the kernel
1638                  * and transfer finished earlier and there was no actual
1639                  * wait time, only API call overhead.
1640                  * Then the actual time could be anywhere between 0 and
1641                  * cycles_wait_est. We will use half of cycles_wait_est.
1642                  */
1643                 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1644             }
1645             ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1646         }
1647     }
1648
1649     if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1650     {
1651         // NOTE: emulation kernel is not included in the balancing region,
1652         // but emulation mode does not target performance anyway
1653         wallcycle_start_nocount(wcycle, ewcFORCE);
1654         do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1655                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1656                      step, nrnb, wcycle);
1657         wallcycle_stop(wcycle, ewcFORCE);
1658     }
1659
1660     if (useGpuPme)
1661     {
1662         pme_gpu_reinit_computation(fr->pmedata, wcycle);
1663     }
1664
1665     if (bUseGPU)
1666     {
1667         /* now clear the GPU outputs while we finish the step on the CPU */
1668         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1669         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1670         nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1671
1672         /* Is dynamic pair-list pruning activated? */
1673         if (nbv->listParams->useDynamicPruning)
1674         {
1675             launchGpuRollingPruning(cr, nbv, inputrec, step);
1676         }
1677         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1678         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1679     }
1680
1681     /* Do the nonbonded GPU (or emulation) force buffer reduction
1682      * on the non-alternating path. */
1683     if (bUseOrEmulGPU && !alternateGpuWait)
1684     {
1685         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1686                                        nbv->nbat, f, wcycle);
1687     }
1688
1689     if (DOMAINDECOMP(cr))
1690     {
1691         dd_force_flop_stop(cr->dd, nrnb);
1692     }
1693
1694     if (bDoForces)
1695     {
1696         /* If we have NoVirSum forces, but we do not calculate the virial,
1697          * we sum fr->f_novirsum=f later.
1698          */
1699         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1700         {
1701             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1702                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1703         }
1704
1705         if (flags & GMX_FORCE_VIRIAL)
1706         {
1707             /* Calculation of the virial must be done after vsites! */
1708             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1709                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1710         }
1711     }
1712
1713     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1714     {
1715         /* In case of node-splitting, the PP nodes receive the long-range
1716          * forces, virial and energy from the PME nodes here.
1717          */
1718         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1719     }
1720
1721     if (bDoForces)
1722     {
1723         post_process_forces(cr, step, nrnb, wcycle,
1724                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1725                             vir_force, mdatoms, graph, fr, vsite,
1726                             flags);
1727     }
1728
1729     if (flags & GMX_FORCE_ENERGY)
1730     {
1731         /* Sum the potential energy terms from group contributions */
1732         sum_epot(&(enerd->grpp), enerd->term);
1733
1734         if (!EI_TPI(inputrec->eI))
1735         {
1736             checkPotentialEnergyValidity(step, *enerd, *inputrec);
1737         }
1738     }
1739 }
1740
1741 static void do_force_cutsGROUP(FILE *fplog,
1742                                const t_commrec *cr,
1743                                const gmx_multisim_t *ms,
1744                                const t_inputrec *inputrec,
1745                                gmx::Awh *awh,
1746                                gmx_enfrot *enforcedRotation,
1747                                int64_t step,
1748                                t_nrnb *nrnb,
1749                                gmx_wallcycle_t wcycle,
1750                                gmx_localtop_t *top,
1751                                const gmx_groups_t *groups,
1752                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1753                                history_t *hist,
1754                                gmx::PaddedArrayRef<gmx::RVec> force,
1755                                tensor vir_force,
1756                                const t_mdatoms *mdatoms,
1757                                gmx_enerdata_t *enerd,
1758                                t_fcdata *fcd,
1759                                real *lambda,
1760                                t_graph *graph,
1761                                t_forcerec *fr,
1762                                const gmx_vsite_t *vsite,
1763                                rvec mu_tot,
1764                                double t,
1765                                gmx_edsam *ed,
1766                                int flags,
1767                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1768                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1769 {
1770     int        cg0, cg1, i, j;
1771     double     mu[2*DIM];
1772     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
1773     gmx_bool   bDoForces;
1774     float      cycles_pme;
1775
1776     const int  start  = 0;
1777     const int  homenr = mdatoms->homenr;
1778
1779     clear_mat(vir_force);
1780
1781     cg0 = 0;
1782     if (DOMAINDECOMP(cr))
1783     {
1784         cg1 = cr->dd->globalAtomGroupIndices.size();
1785     }
1786     else
1787     {
1788         cg1 = top->cgs.nr;
1789     }
1790     if (fr->n_tpi > 0)
1791     {
1792         cg1--;
1793     }
1794
1795     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
1796     bNS            = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1797     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1798     bFillGrid      = (bNS && bStateChanged);
1799     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1800     bDoForces      = ((flags & GMX_FORCE_FORCES) != 0);
1801
1802     if (bStateChanged)
1803     {
1804         update_forcerec(fr, box);
1805
1806         if (inputrecNeedMutot(inputrec))
1807         {
1808             /* Calculate total (local) dipole moment in a temporary common array.
1809              * This makes it possible to sum them over nodes faster.
1810              */
1811             calc_mu(start, homenr,
1812                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1813                     mu, mu+DIM);
1814         }
1815     }
1816
1817     if (fr->ePBC != epbcNONE)
1818     {
1819         /* Compute shift vectors every step,
1820          * because of pressure coupling or box deformation!
1821          */
1822         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1823         {
1824             calc_shifts(box, fr->shift_vec);
1825         }
1826
1827         if (bCalcCGCM)
1828         {
1829             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1830                                      &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1831             inc_nrnb(nrnb, eNR_CGCM, homenr);
1832             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1833         }
1834         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1835         {
1836             unshift_self(graph, box, as_rvec_array(x.data()));
1837         }
1838     }
1839     else if (bCalcCGCM)
1840     {
1841         calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1842         inc_nrnb(nrnb, eNR_CGCM, homenr);
1843     }
1844
1845     if (bCalcCGCM && gmx_debug_at)
1846     {
1847         pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1848     }
1849
1850 #if GMX_MPI
1851     if (!thisRankHasDuty(cr, DUTY_PME))
1852     {
1853         /* Send particle coordinates to the pme nodes.
1854          * Since this is only implemented for domain decomposition
1855          * and domain decomposition does not use the graph,
1856          * we do not need to worry about shifting.
1857          */
1858         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1859                                  lambda[efptCOUL], lambda[efptVDW],
1860                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1861                                  step, wcycle);
1862     }
1863 #endif /* GMX_MPI */
1864
1865     /* Communicate coordinates and sum dipole if necessary */
1866     if (DOMAINDECOMP(cr))
1867     {
1868         dd_move_x(cr->dd, box, x, wcycle);
1869
1870         /* No GPU support, no move_x overlap, so reopen the balance region here */
1871         if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1872         {
1873             ddReopenBalanceRegionCpu(cr->dd);
1874         }
1875     }
1876
1877     if (inputrecNeedMutot(inputrec))
1878     {
1879         if (bStateChanged)
1880         {
1881             if (PAR(cr))
1882             {
1883                 gmx_sumd(2*DIM, mu, cr);
1884                 ddReopenBalanceRegionCpu(cr->dd);
1885             }
1886             for (i = 0; i < 2; i++)
1887             {
1888                 for (j = 0; j < DIM; j++)
1889                 {
1890                     fr->mu_tot[i][j] = mu[i*DIM + j];
1891                 }
1892             }
1893         }
1894         if (fr->efep == efepNO)
1895         {
1896             copy_rvec(fr->mu_tot[0], mu_tot);
1897         }
1898         else
1899         {
1900             for (j = 0; j < DIM; j++)
1901             {
1902                 mu_tot[j] =
1903                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1904             }
1905         }
1906     }
1907
1908     /* Reset energies */
1909     reset_enerdata(enerd);
1910     clear_rvecs(SHIFTS, fr->fshift);
1911
1912     if (bNS)
1913     {
1914         wallcycle_start(wcycle, ewcNS);
1915
1916         if (graph && bStateChanged)
1917         {
1918             /* Calculate intramolecular shift vectors to make molecules whole */
1919             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1920         }
1921
1922         /* Do the actual neighbour searching */
1923         ns(fplog, fr, box,
1924            groups, top, mdatoms,
1925            cr, nrnb, bFillGrid);
1926
1927         wallcycle_stop(wcycle, ewcNS);
1928     }
1929
1930     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1931     {
1932         wallcycle_start(wcycle, ewcPPDURINGPME);
1933         dd_force_flop_start(cr->dd, nrnb);
1934     }
1935
1936     if (inputrec->bRot)
1937     {
1938         wallcycle_start(wcycle, ewcROT);
1939         do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1940         wallcycle_stop(wcycle, ewcROT);
1941     }
1942
1943     /* Temporary solution until all routines take PaddedRVecVector */
1944     rvec *f = as_rvec_array(force.data());
1945
1946     /* Start the force cycle counter.
1947      * Note that a different counter is used for dynamic load balancing.
1948      */
1949     wallcycle_start(wcycle, ewcFORCE);
1950
1951     gmx::ArrayRef<gmx::RVec> forceRef = force;
1952     if (bDoForces)
1953     {
1954         /* If we need to compute the virial, we might need a separate
1955          * force buffer for algorithms for which the virial is calculated
1956          * separately, such as PME.
1957          */
1958         if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1959         {
1960             forceRef = *fr->forceBufferForDirectVirialContributions;
1961
1962             clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1963         }
1964
1965         /* Clear the short- and long-range forces */
1966         clear_rvecs(fr->natoms_force_constr, f);
1967     }
1968
1969     /* forceWithVirial might need the full force atom range */
1970     gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1971
1972     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1973     {
1974         clear_pull_forces(inputrec->pull_work);
1975     }
1976
1977     /* update QMMMrec, if necessary */
1978     if (fr->bQMMM)
1979     {
1980         update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1981     }
1982
1983     /* Compute the bonded and non-bonded energies and optionally forces */
1984     do_force_lowlevel(fr, inputrec, &(top->idef),
1985                       cr, ms, nrnb, wcycle, mdatoms,
1986                       as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1987                       box, inputrec->fepvals, lambda,
1988                       graph, &(top->excls), fr->mu_tot,
1989                       flags,
1990                       &cycles_pme);
1991
1992     wallcycle_stop(wcycle, ewcFORCE);
1993
1994     if (DOMAINDECOMP(cr))
1995     {
1996         dd_force_flop_stop(cr->dd, nrnb);
1997
1998         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1999         {
2000             ddCloseBalanceRegionCpu(cr->dd);
2001         }
2002     }
2003
2004     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2005                          step, t, wcycle,
2006                          fr->forceProviders, box, x, mdatoms, lambda,
2007                          flags, &forceWithVirial, enerd,
2008                          ed, bNS);
2009
2010     if (bDoForces)
2011     {
2012         /* Communicate the forces */
2013         if (DOMAINDECOMP(cr))
2014         {
2015             dd_move_f(cr->dd, force, fr->fshift, wcycle);
2016             /* Do we need to communicate the separate force array
2017              * for terms that do not contribute to the single sum virial?
2018              * Position restraints and electric fields do not introduce
2019              * inter-cg forces, only full electrostatics methods do.
2020              * When we do not calculate the virial, fr->f_novirsum = f,
2021              * so we have already communicated these forces.
2022              */
2023             if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2024                 (flags & GMX_FORCE_VIRIAL))
2025             {
2026                 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2027             }
2028         }
2029
2030         /* If we have NoVirSum forces, but we do not calculate the virial,
2031          * we sum fr->f_novirsum=f later.
2032          */
2033         if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2034         {
2035             spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2036                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2037         }
2038
2039         if (flags & GMX_FORCE_VIRIAL)
2040         {
2041             /* Calculation of the virial must be done after vsites! */
2042             calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2043                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2044         }
2045     }
2046
2047     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2048     {
2049         /* In case of node-splitting, the PP nodes receive the long-range
2050          * forces, virial and energy from the PME nodes here.
2051          */
2052         pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2053     }
2054
2055     if (bDoForces)
2056     {
2057         post_process_forces(cr, step, nrnb, wcycle,
2058                             top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2059                             vir_force, mdatoms, graph, fr, vsite,
2060                             flags);
2061     }
2062
2063     if (flags & GMX_FORCE_ENERGY)
2064     {
2065         /* Sum the potential energy terms from group contributions */
2066         sum_epot(&(enerd->grpp), enerd->term);
2067
2068         if (!EI_TPI(inputrec->eI))
2069         {
2070             checkPotentialEnergyValidity(step, *enerd, *inputrec);
2071         }
2072     }
2073
2074 }
2075
2076 void do_force(FILE                                     *fplog,
2077               const t_commrec                          *cr,
2078               const gmx_multisim_t                     *ms,
2079               const t_inputrec                         *inputrec,
2080               gmx::Awh                                 *awh,
2081               gmx_enfrot                               *enforcedRotation,
2082               int64_t                                   step,
2083               t_nrnb                                   *nrnb,
2084               gmx_wallcycle_t                           wcycle,
2085               gmx_localtop_t                           *top,
2086               const gmx_groups_t                       *groups,
2087               matrix                                    box,
2088               gmx::PaddedArrayRef<gmx::RVec>            x,
2089               history_t                                *hist,
2090               gmx::PaddedArrayRef<gmx::RVec>            force,
2091               tensor                                    vir_force,
2092               const t_mdatoms                          *mdatoms,
2093               gmx_enerdata_t                           *enerd,
2094               t_fcdata                                 *fcd,
2095               gmx::ArrayRef<real>                       lambda,
2096               t_graph                                  *graph,
2097               t_forcerec                               *fr,
2098               const gmx_vsite_t                        *vsite,
2099               rvec                                      mu_tot,
2100               double                                    t,
2101               gmx_edsam                                *ed,
2102               int                                       flags,
2103               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2104               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2105 {
2106     /* modify force flag if not doing nonbonded */
2107     if (!fr->bNonbonded)
2108     {
2109         flags &= ~GMX_FORCE_NONBONDED;
2110     }
2111
2112     GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2113     GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2114
2115     switch (inputrec->cutoff_scheme)
2116     {
2117         case ecutsVERLET:
2118             do_force_cutsVERLET(fplog, cr, ms, inputrec,
2119                                 awh, enforcedRotation, step, nrnb, wcycle,
2120                                 top,
2121                                 groups,
2122                                 box, x, hist,
2123                                 force, vir_force,
2124                                 mdatoms,
2125                                 enerd, fcd,
2126                                 lambda.data(), graph,
2127                                 fr, fr->ic,
2128                                 vsite, mu_tot,
2129                                 t, ed,
2130                                 flags,
2131                                 ddOpenBalanceRegion,
2132                                 ddCloseBalanceRegion);
2133             break;
2134         case ecutsGROUP:
2135             do_force_cutsGROUP(fplog, cr, ms, inputrec,
2136                                awh, enforcedRotation, step, nrnb, wcycle,
2137                                top,
2138                                groups,
2139                                box, x, hist,
2140                                force, vir_force,
2141                                mdatoms,
2142                                enerd, fcd,
2143                                lambda.data(), graph,
2144                                fr, vsite, mu_tot,
2145                                t, ed,
2146                                flags,
2147                                ddOpenBalanceRegion,
2148                                ddCloseBalanceRegion);
2149             break;
2150         default:
2151             gmx_incons("Invalid cut-off scheme passed!");
2152     }
2153
2154     /* In case we don't have constraints and are using GPUs, the next balancing
2155      * region starts here.
2156      * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2157      * virial calculation and COM pulling, is thus not included in
2158      * the balance timing, which is ok as most tasks do communication.
2159      */
2160     if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2161     {
2162         ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2163     }
2164 }
2165
2166
2167 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2168                         const t_inputrec *ir, const t_mdatoms *md,
2169                         t_state *state)
2170 {
2171     int             i, m, start, end;
2172     int64_t         step;
2173     real            dt = ir->delta_t;
2174     real            dvdl_dum;
2175     rvec           *savex;
2176
2177     /* We need to allocate one element extra, since we might use
2178      * (unaligned) 4-wide SIMD loads to access rvec entries.
2179      */
2180     snew(savex, state->natoms + 1);
2181
2182     start = 0;
2183     end   = md->homenr;
2184
2185     if (debug)
2186     {
2187         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2188                 start, md->homenr, end);
2189     }
2190     /* Do a first constraint pass to reset particles... */
2191     step = ir->init_step;
2192     if (fplog)
2193     {
2194         char buf[STEPSTRSIZE];
2195         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2196                 gmx_step_str(step, buf));
2197     }
2198     dvdl_dum = 0;
2199
2200     /* constrain the current position */
2201     constr->apply(TRUE, FALSE,
2202                   step, 0, 1.0,
2203                   as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2204                   state->box,
2205                   state->lambda[efptBONDED], &dvdl_dum,
2206                   nullptr, nullptr, gmx::ConstraintVariable::Positions);
2207     if (EI_VV(ir->eI))
2208     {
2209         /* constrain the initial velocity, and save it */
2210         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2211         constr->apply(TRUE, FALSE,
2212                       step, 0, 1.0,
2213                       as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2214                       state->box,
2215                       state->lambda[efptBONDED], &dvdl_dum,
2216                       nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2217     }
2218     /* constrain the initial velocities at t-dt/2 */
2219     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2220     {
2221         for (i = start; (i < end); i++)
2222         {
2223             for (m = 0; (m < DIM); m++)
2224             {
2225                 /* Reverse the velocity */
2226                 state->v[i][m] = -state->v[i][m];
2227                 /* Store the position at t-dt in buf */
2228                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2229             }
2230         }
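        /* At this point state->v holds the reversed velocities and savex
         * holds x(0) - dt*v(0), i.e. the (unconstrained) estimate of the
         * positions at t = -dt that is constrained below.
         */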
2231         /* Shake the positions at t=-dt with the positions at t=0
2232          * as reference coordinates.
2233          */
2234         if (fplog)
2235         {
2236             char buf[STEPSTRSIZE];
2237             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2238                     gmx_step_str(step, buf));
2239         }
2240         dvdl_dum = 0;
2241         constr->apply(TRUE, FALSE,
2242                       step, -1, 1.0,
2243                       as_rvec_array(state->x.data()), savex, nullptr,
2244                       state->box,
2245                       state->lambda[efptBONDED], &dvdl_dum,
2246                       as_rvec_array(state->v.data()), nullptr, gmx::ConstraintVariable::Positions);
2247
2248         for (i = start; i < end; i++)
2249         {
2250             for (m = 0; m < DIM; m++)
2251             {
2252                 /* Re-reverse the velocities */
2253                 state->v[i][m] = -state->v[i][m];
2254             }
2255         }
2256     }
2257     sfree(savex);
2258 }
2259
2260
2261 static void
2262 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2263                 double *enerout, double *virout)
2264 {
2265     double enersum, virsum;
2266     double invscale, invscale2, invscale3;
2267     double r, ea, eb, ec, pa, pb, pc, pd;
2268     double y0, f, g, h;
2269     int    ri, offset;
2270     double tabfactor;
2271
2272     invscale  = 1.0/scale;
2273     invscale2 = invscale*invscale;
2274     invscale3 = invscale*invscale2;
2275
2276     /* Following summation derived from cubic spline definition,
2277      * Numerical Recipes in C, second edition, p. 113-116.  Exact for
2278      * the cubic spline.  We first calculate the negative of the
2279      * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2280      * add the more standard, abrupt cutoff correction to that result,
2281      * yielding the long-range correction for a switched function.  We
2282      * perform both the pressure and energy loops at the same time for
2283      * simplicity, as the computational cost is low. */
2284
2285     if (offstart == 0)
2286     {
2287         /* Since the dispersion table has been scaled down a factor
2288          * 6.0 and the repulsion a factor 12.0 to compensate for the
2289          * c6/c12 parameters inside nbfp[] being scaled up (to save
2290          * flops in kernels), we need to correct for this.
2291          */
2292         tabfactor = 6.0;
2293     }
2294     else
2295     {
2296         tabfactor = 12.0;
2297     }
2298
2299     enersum = 0.0;
2300     virsum  = 0.0;
2301     for (ri = rstart; ri < rend; ++ri)
2302     {
2303         r  = ri*invscale;
2304         ea = invscale3;
2305         eb = 2.0*invscale2*r;
2306         ec = invscale*r*r;
2307
2308         pa = invscale3;
2309         pb = 3.0*invscale2*r;
2310         pc = 3.0*invscale*r*r;
2311         pd = r*r*r;
2312
2313         /* this "8" is from the packing in the vdwtab array - perhaps
2314            should be defined? */
2315
2316         offset = 8*ri + offstart;
2317         y0     = vdwtab[offset];
2318         f      = vdwtab[offset+1];
2319         g      = vdwtab[offset+2];
2320         h      = vdwtab[offset+3];
2321
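        /* Within this bin the tabulated function is the cubic spline
         * y(t) = y0 + f*t + g*t^2 + h*t^3, with t in [0,1] and
         * r = (ri + t)/scale.  Using
         *   (ec + eb*t + ea*t^2)*dt = r^2*dr
         *   (f + 2*g*t + 3*h*t^2)*(pd + pc*t + pb*t^2 + pa*t^3)*dt = r^3*(dy/dr)*dr
         * the closed forms below are the per-bin integrals of r^2*y(r)
         * (energy) and r^3*dy/dr (virial).
         */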
2322         enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2323         virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2324     }
2325     *enerout = 4.0*M_PI*enersum*tabfactor;
2326     *virout  = 4.0*M_PI*virsum*tabfactor;
2327 }
2328
2329 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2330 {
2331     double   eners[2], virs[2], enersum, virsum;
2332     double   r0, rc3, rc9;
2333     int      ri0, ri1, i;
2334     real     scale, *vdwtab;
2335
2336     fr->enershiftsix    = 0;
2337     fr->enershifttwelve = 0;
2338     fr->enerdiffsix     = 0;
2339     fr->enerdifftwelve  = 0;
2340     fr->virdiffsix      = 0;
2341     fr->virdifftwelve   = 0;
2342
2343     const interaction_const_t *ic = fr->ic;
2344
2345     if (eDispCorr != edispcNO)
2346     {
2347         for (i = 0; i < 2; i++)
2348         {
2349             eners[i] = 0;
2350             virs[i]  = 0;
2351         }
2352         if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2353             (ic->vdw_modifier == eintmodPOTSWITCH) ||
2354             (ic->vdw_modifier == eintmodFORCESWITCH) ||
2355             (ic->vdwtype == evdwSHIFT) ||
2356             (ic->vdwtype == evdwSWITCH))
2357         {
2358             if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2359                  (ic->vdw_modifier == eintmodFORCESWITCH) ||
2360                  (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2361             {
2362                 gmx_fatal(FARGS,
2363                           "With dispersion correction rvdw-switch can not be zero "
2364                           "for vdw-type = %s", evdw_names[ic->vdwtype]);
2365             }
2366
2367             /* TODO This code depends on the logic in tables.c that
2368                constructs the table layout, which should be made
2369                explicit in future cleanup. */
2370             GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2371                        "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2372             scale  = fr->dispersionCorrectionTable->scale;
2373             vdwtab = fr->dispersionCorrectionTable->data;
2374
2375             /* Round the cut-offs to exact table values for precision */
2376             ri0  = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2377             ri1  = static_cast<int>(std::ceil(ic->rvdw*scale));
2378
2379             /* The code below has some support for handling force-switching, i.e.
2380              * when the force (instead of potential) is switched over a limited
2381              * region. This leads to a constant shift in the potential inside the
2382              * switching region, which we can handle by adding a constant energy
2383              * term in the force-switch case just like when we do potential-shift.
2384              *
2385              * For now this is not enabled, but to keep the functionality in the
2386              * code we check separately for switch and shift. When we do force-switch
2387              * the shifting point is rvdw_switch, while it is the cutoff when we
2388              * have a classical potential-shift.
2389              *
2390              * For a pure potential-shift the potential has a constant shift
2391              * all the way out to the cutoff, and that is it. For other forms
2392              * we need to calculate the constant shift up to the point where we
2393              * start modifying the potential.
2394              */
2395             ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2396
2397             r0   = ri0/scale;
2398             rc3  = r0*r0*r0;
2399             rc9  = rc3*rc3*rc3;
2400
2401             if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2402                 (ic->vdwtype == evdwSHIFT))
2403             {
2404                 /* Determine the constant energy shift below rvdw_switch.
2405                  * Table has a scale factor since we have scaled it down to compensate
2406                  * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2407                  */
2408                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2409                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2410             }
2411             else if (ic->vdw_modifier == eintmodPOTSHIFT)
2412             {
2413                 fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
2414                 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2415             }
2416
2417             /* Add the constant part from 0 to rvdw_switch.
2418              * This integration from 0 to rvdw_switch overcounts the number
2419              * of interactions by 1, as it also counts the self interaction.
2420              * We will correct for this later.
2421              */
2422             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2423             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2424
2425             /* Calculate the contribution in the range [r0,r1] where we
2426              * modify the potential. For a pure potential-shift modifier we will
2427              * have ri0==ri1, and there will not be any contribution here.
2428              */
2429             for (i = 0; i < 2; i++)
2430             {
2431                 enersum = 0;
2432                 virsum  = 0;
2433                 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2434                 eners[i] -= enersum;
2435                 virs[i]  -= virsum;
2436             }
2437
2438             /* Alright: Above we compensated by REMOVING the parts outside r0
2439              * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2440              *
2441              * Regardless of whether r0 is the point where we start switching,
2442              * or the cutoff where we calculated the constant shift, we include
2443              * all the parts we are missing out to infinity from r0 by
2444              * calculating the analytical dispersion correction.
2445              */
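            /* These analytical tails follow from integrating over all of
             * space beyond r0:
             *   int_{r0}^{inf} 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*r0^3)
             *   int_{r0}^{inf} 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*r0^9)
             * and, for the virial terms, from the corresponding integrals
             * of 4*pi*r^2 * r*dU/dr.
             */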
2446             eners[0] += -4.0*M_PI/(3.0*rc3);
2447             eners[1] +=  4.0*M_PI/(9.0*rc9);
2448             virs[0]  +=  8.0*M_PI/rc3;
2449             virs[1]  += -16.0*M_PI/(3.0*rc9);
2450         }
2451         else if (ic->vdwtype == evdwCUT ||
2452                  EVDW_PME(ic->vdwtype) ||
2453                  ic->vdwtype == evdwUSER)
2454         {
2455             if (ic->vdwtype == evdwUSER && fplog)
2456             {
2457                 fprintf(fplog,
2458                         "WARNING: using dispersion correction with user tables\n");
2459             }
2460
2461             /* Note that with LJ-PME, the dispersion correction is multiplied
2462              * by the difference between the actual C6 and the value of C6
2463              * that would produce the combination rule.
2464              * This means the normal energy and virial difference formulas
2465              * can be used here.
2466              */
2467
2468             rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
2469             rc9  = rc3*rc3*rc3;
2470             /* Contribution beyond the cut-off */
2471             eners[0] += -4.0*M_PI/(3.0*rc3);
2472             eners[1] +=  4.0*M_PI/(9.0*rc9);
2473             if (ic->vdw_modifier == eintmodPOTSHIFT)
2474             {
2475                 /* Contribution within the cut-off */
2476                 eners[0] += -4.0*M_PI/(3.0*rc3);
2477                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2478             }
2479             /* Contribution beyond the cut-off */
2480             virs[0]  +=  8.0*M_PI/rc3;
2481             virs[1]  += -16.0*M_PI/(3.0*rc9);
2482         }
2483         else
2484         {
2485             gmx_fatal(FARGS,
2486                       "Dispersion correction is not implemented for vdw-type = %s",
2487                       evdw_names[ic->vdwtype]);
2488         }
2489
2490         /* When we deprecate the group kernels the code below can go too */
2491         if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2492         {
2493             /* Calculate self-interaction coefficient (assuming that
2494              * the reciprocal-space contribution is constant in the
2495              * region that contributes to the self-interaction).
2496              */
2497             fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2498
2499             eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2500             virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2501         }
2502
2503         fr->enerdiffsix    = eners[0];
2504         fr->enerdifftwelve = eners[1];
2505         /* The 0.5 is due to the Gromacs definition of the virial */
2506         fr->virdiffsix     = 0.5*virs[0];
2507         fr->virdifftwelve  = 0.5*virs[1];
2508     }
2509 }
2510
2511 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2512                    const matrix box, real lambda, tensor pres, tensor virial,
2513                    real *prescorr, real *enercorr, real *dvdlcorr)
2514 {
2515     gmx_bool bCorrAll, bCorrPres;
2516     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2517     int      m;
2518
2519     *prescorr = 0;
2520     *enercorr = 0;
2521     *dvdlcorr = 0;
2522
2523     clear_mat(virial);
2524     clear_mat(pres);
2525
2526     if (ir->eDispCorr != edispcNO)
2527     {
2528         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2529                      ir->eDispCorr == edispcAllEnerPres);
2530         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2531                      ir->eDispCorr == edispcAllEnerPres);
2532
2533         invvol = 1/det(box);
2534         if (fr->n_tpi)
2535         {
2536             /* Only correct for the interactions with the inserted molecule */
2537             dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2538             ninter = fr->n_tpi;
2539         }
2540         else
2541         {
2542             dens   = fr->numAtomsForDispersionCorrection*invvol;
2543             ninter = 0.5*fr->numAtomsForDispersionCorrection;
2544         }
2545
2546         if (ir->efep == efepNO)
2547         {
2548             avcsix    = fr->avcsix[0];
2549             avctwelve = fr->avctwelve[0];
2550         }
2551         else
2552         {
2553             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2554             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2555         }
2556
2557         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2558         *enercorr += avcsix*enerdiff;
2559         dvdlambda  = 0.0;
2560         if (ir->efep != efepNO)
2561         {
2562             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2563         }
2564         if (bCorrAll)
2565         {
2566             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2567             *enercorr += avctwelve*enerdiff;
2568             if (fr->efep != efepNO)
2569             {
2570                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2571             }
2572         }
2573
2574         if (bCorrPres)
2575         {
2576             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2577             if (ir->eDispCorr == edispcAllEnerPres)
2578             {
2579                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2580             }
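            /* svir is the scalar (isotropic) virial correction; the pressure
             * correction computed below is P_corr = -2*svir/V * PRESFAC,
             * where PRESFAC converts the internal pressure unit
             * (kJ mol^-1 nm^-3) to bar.
             */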
2581             /* The factor 2 is because of the Gromacs virial definition */
2582             spres = -2.0*invvol*svir*PRESFAC;
2583
2584             for (m = 0; m < DIM; m++)
2585             {
2586                 virial[m][m] += svir;
2587                 pres[m][m]   += spres;
2588             }
2589             *prescorr += spres;
2590         }
2591
2592         /* Can't currently control when it prints; for now, just print when debugging */
2593         if (debug)
2594         {
2595             if (bCorrAll)
2596             {
2597                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2598                         avcsix, avctwelve);
2599             }
2600             if (bCorrPres)
2601             {
2602                 fprintf(debug,
2603                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2604                         *enercorr, spres, svir);
2605             }
2606             else
2607             {
2608                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2609             }
2610         }
2611
2612         if (fr->efep != efepNO)
2613         {
2614             *dvdlcorr += dvdlambda;
2615         }
2616     }
2617 }
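
/* For orientation only: a minimal standalone sketch (not part of the GROMACS
 * API and not called anywhere) of the textbook homogeneous-fluid tail
 * corrections that calc_enervirdiff() and calc_dispcorr() above generalize,
 * for the simplest case of a plain cut-off at rc with no potential modifiers
 * and no free-energy coupling.  The function and parameter names are
 * illustrative; the pressure is returned in energy/volume units, i.e. without
 * the PRESFAC conversion used above.
 */
static inline void exampleLjTailCorrection(double nAtoms, double volume,
                                           double c6avg, double rc,
                                           double *enerCorr, double *presCorr)
{
    const double rc3 = rc*rc*rc;

    /* E_corr = -(2/3)*pi*(N^2/V)*<C6>/rc^3, from integrating -<C6>/r^6 beyond rc */
    *enerCorr = -2.0*M_PI*nAtoms*nAtoms*c6avg/(3.0*volume*rc3);
    /* P_corr = -(4/3)*pi*(N^2/V^2)*<C6>/rc^3, from the corresponding virial tail */
    *presCorr = -4.0*M_PI*nAtoms*nAtoms*c6avg/(3.0*volume*volume*rc3);
}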
2618
2619 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2620                             const gmx_mtop_t *mtop, rvec x[],
2621                             gmx_bool bFirst)
2622 {
2623     t_graph        *graph;
2624     int             as, mol;
2625
2626     if (bFirst && fplog)
2627     {
2628         fprintf(fplog, "Removing pbc first time\n");
2629     }
2630
2631     snew(graph, 1);
2632     as = 0;
2633     for (const gmx_molblock_t &molb : mtop->molblock)
2634     {
2635         const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2636         if (moltype.atoms.nr == 1 ||
2637             (!bFirst && moltype.cgs.nr == 1))
2638         {
2639             /* Just one atom or charge group in the molecule, no PBC required */
2640             as += molb.nmol*moltype.atoms.nr;
2641         }
2642         else
2643         {
2644             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2645             mk_graph_ilist(nullptr, moltype.ilist,
2646                            0, moltype.atoms.nr, FALSE, FALSE, graph);
2647
2648             for (mol = 0; mol < molb.nmol; mol++)
2649             {
2650                 mk_mshift(fplog, graph, ePBC, box, x+as);
2651
2652                 shift_self(graph, box, x+as);
2653                 /* The molecule is whole now.
2654                  * We don't need the second mk_mshift call as in do_pbc_first,
2655                  * since we no longer need this graph.
2656                  */
2657
2658                 as += moltype.atoms.nr;
2659             }
2660             done_graph(graph);
2661         }
2662     }
2663     sfree(graph);
2664 }
2665
2666 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2667                        const gmx_mtop_t *mtop, rvec x[])
2668 {
2669     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2670 }
2671
2672 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2673                  const gmx_mtop_t *mtop, rvec x[])
2674 {
2675     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2676 }
2677
2678 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2679 {
2680     int t, nth;
2681     nth = gmx_omp_nthreads_get(emntDefault);
2682
2683 #pragma omp parallel for num_threads(nth) schedule(static)
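    /* Each thread wraps a contiguous chunk of atoms: thread t of nth handles
     * the half-open index range [natoms*t/nth, natoms*(t+1)/nth), so the
     * chunks tile the whole array and differ in size by at most one
     * (e.g. natoms = 10, nth = 4 gives chunk sizes 2, 3, 2, 3).
     */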
2684     for (t = 0; t < nth; t++)
2685     {
2686         try
2687         {
2688             size_t natoms = x.size();
2689             size_t offset = (natoms*t    )/nth;
2690             size_t len    = (natoms*(t + 1))/nth - offset;
2691             put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2692         }
2693         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2694     }
2695 }
2696
2697 // TODO This can be cleaned up a lot, and moved back to runner.cpp
2698 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2699                 const t_inputrec *inputrec,
2700                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2701                 gmx_walltime_accounting_t walltime_accounting,
2702                 nonbonded_verlet_t *nbv,
2703                 const gmx_pme_t *pme,
2704                 gmx_bool bWriteStat)
2705 {
2706     t_nrnb *nrnb_tot = nullptr;
2707     double  delta_t  = 0;
2708     double  nbfs     = 0, mflop = 0;
2709     double  elapsed_time,
2710             elapsed_time_over_all_ranks,
2711             elapsed_time_over_all_threads,
2712             elapsed_time_over_all_threads_over_all_ranks;
2713     /* Control whether it is valid to print a report. Only the
2714        simulation master may print, but it should not do so if the run
2715        terminated e.g. before a scheduled reset step. This is
2716        complicated by the fact that PME ranks are unaware of the
2717        reason why they were sent a pmerecvqxFINISH. To avoid
2718        communication deadlocks, we always do the communication for the
2719        report, even if we've decided not to write the report, because
2720        how long it takes to finish the run is not important when we've
2721        decided not to report on the simulation performance.
2722
2723        Further, we only report performance for dynamical integrators,
2724        because those are the only ones for which we plan to
2725        consider doing any optimizations. */
2726     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2727
2728     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2729     {
2730         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2731         printReport = false;
2732     }
2733
2734     if (cr->nnodes > 1)
2735     {
2736         snew(nrnb_tot, 1);
2737 #if GMX_MPI
2738         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2739                       cr->mpi_comm_mysim);
2740 #endif
2741     }
2742     else
2743     {
2744         nrnb_tot = nrnb;
2745     }
2746
2747     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
2748     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2749     if (cr->nnodes > 1)
2750     {
2751 #if GMX_MPI
2752         /* reduce elapsed_time over all MPI ranks in the current simulation */
2753         MPI_Allreduce(&elapsed_time,
2754                       &elapsed_time_over_all_ranks,
2755                       1, MPI_DOUBLE, MPI_SUM,
2756                       cr->mpi_comm_mysim);
2757         elapsed_time_over_all_ranks /= cr->nnodes;
2758         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2759          * current simulation. */
2760         MPI_Allreduce(&elapsed_time_over_all_threads,
2761                       &elapsed_time_over_all_threads_over_all_ranks,
2762                       1, MPI_DOUBLE, MPI_SUM,
2763                       cr->mpi_comm_mysim);
2764 #endif
2765     }
2766     else
2767     {
2768         elapsed_time_over_all_ranks                  = elapsed_time;
2769         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2770     }
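    /* Note that elapsed_time_over_all_ranks is the average wall time per rank
     * (the MPI sum above is divided by cr->nnodes), whereas
     * elapsed_time_over_all_threads_over_all_ranks stays a plain sum of
     * per-thread times, i.e. roughly the total core time consumed. */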
2771
2772     if (printReport)
2773     {
2774         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2775     }
2776     if (cr->nnodes > 1)
2777     {
2778         sfree(nrnb_tot);
2779     }
2780
2781     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2782     {
2783         print_dd_statistics(cr, inputrec, fplog);
2784     }
2785
2786     /* TODO Move the responsibility for any scaling by thread counts
2787      * to the code that handles the thread region, so that there's a
2788      * mechanism to keep cycle counting working during the transition
2789      * to task parallelism. */
2790     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
2791     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2792     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2793     auto cycle_sum(wallcycle_sum(cr, wcycle));
2794
2795     if (printReport)
2796     {
2797         auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2798         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
2799         if (pme_gpu_task_enabled(pme))
2800         {
2801             pme_gpu_get_timings(pme, &pme_gpu_timings);
2802         }
2803         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2804                         elapsed_time_over_all_ranks,
2805                         wcycle, cycle_sum,
2806                         nbnxn_gpu_timings,
2807                         &pme_gpu_timings);
2808
2809         if (EI_DYNAMICS(inputrec->eI))
2810         {
2811             delta_t = inputrec->delta_t;
2812         }
2813
2814         if (fplog)
2815         {
2816             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2817                        elapsed_time_over_all_ranks,
2818                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2819                        delta_t, nbfs, mflop);
2820         }
2821         if (bWriteStat)
2822         {
2823             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2824                        elapsed_time_over_all_ranks,
2825                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2826                        delta_t, nbfs, mflop);
2827         }
2828     }
2829 }
2830
2831 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2832 {
2833     /* this function works, but could probably use a logic rewrite to keep all the different
2834        types of efep straight. */
2835
2836     if ((ir->efep == efepNO) && (!ir->bSimTemp))
2837     {
2838         return;
2839     }
2840
2841     t_lambda *fep = ir->fepvals;
2842     *fep_state    = fep->init_fep_state; /* this might overwrite the state read from
2843                                             a checkpoint -- a kludge is in place for now
2844                                             to prevent that. */
2845
2846     for (int i = 0; i < efptNR; i++)
2847     {
2848         /* overwrite lambda state with init_lambda for now for backwards compatibility */
2849         if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2850         {
2851             lambda[i] = fep->init_lambda;
2852             if (lam0)
2853             {
2854                 lam0[i] = lambda[i];
2855             }
2856         }
2857         else
2858         {
2859             lambda[i] = fep->all_lambda[i][*fep_state];
2860             if (lam0)
2861             {
2862                 lam0[i] = lambda[i];
2863             }
2864         }
2865     }
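    /* Illustration only: with init_lambda = 0.25 every lambda component (and
     * lam0, when non-null) becomes 0.25; with init_lambda = -1 and
     * fep_state = 2, component i instead takes fep->all_lambda[i][2]. */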
2866     if (ir->bSimTemp)
2867     {
2868         /* need to rescale control temperatures to match current state */
2869         for (int i = 0; i < ir->opts.ngtc; i++)
2870         {
2871             if (ir->opts.ref_t[i] > 0)
2872             {
2873                 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2874             }
2875         }
2876     }
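    /* Illustration only: with simtempvals->temperatures = {300, 310, 320} K
     * and fep_state = 1, every temperature-coupled group (ref_t > 0) gets
     * ref_t = 310 K; groups with ref_t == 0 are left untouched. */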
2877
2878     /* Write the initial lambda components to the log */
2879     if (fplog != nullptr)
2880     {
2881         fprintf(fplog, "Initial vector of lambda components:[ ");
2882         for (const auto &l : lambda)
2883         {
2884             fprintf(fplog, "%10.4f ", l);
2885         }
2886         fprintf(fplog, "]\n");
2887     }
2888 }
2889
2890
2891 void init_md(FILE *fplog,
2892              const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2893              t_inputrec *ir, const gmx_output_env_t *oenv,
2894              const MdrunOptions &mdrunOptions,
2895              double *t, double *t0,
2896              t_state *globalState, double *lam0,
2897              t_nrnb *nrnb, gmx_mtop_t *mtop,
2898              gmx_update_t **upd,
2899              gmx::BoxDeformation *deform,
2900              int nfile, const t_filenm fnm[],
2901              gmx_mdoutf_t *outf, t_mdebin **mdebin,
2902              tensor force_vir, tensor shake_vir,
2903              tensor total_vir, tensor pres, rvec mu_tot,
2904              gmx_bool *bSimAnn, t_vcm **vcm,
2905              gmx_wallcycle_t wcycle)
2906 {
2907     int  i;
2908
2909     /* Initial values */
2910     *t = *t0       = ir->init_t;
2911
2912     *bSimAnn = FALSE;
2913     for (i = 0; i < ir->opts.ngtc; i++)
2914     {
2915         /* set bSimAnn if any group is being annealed */
2916         if (ir->opts.annealing[i] != eannNO)
2917         {
2918             *bSimAnn = TRUE;
2919         }
2920     }
2921
2922     /* Initialize lambda variables */
2923     /* TODO: Clean up initialization of fep_state and lambda in t_state.
2924      * We currently need to call initialize_lambdas on non-master ranks
2925      * to initialize lam0.
2926      */
2927     if (MASTER(cr))
2928     {
2929         initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2930     }
2931     else
2932     {
2933         int                      tmpFepState;
2934         std::array<real, efptNR> tmpLambda;
2935         initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2936     }
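    /* On non-master ranks the values written to tmpFepState and tmpLambda are
     * discarded; the call is made only so that lam0 is filled in consistently
     * on every rank (see the TODO above). */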
2937
2938     // TODO upd is never NULL in practice, but the analysers don't know that
2939     if (upd)
2940     {
2941         *upd = init_update(ir, deform);
2942     }
2943     if (*bSimAnn)
2944     {
2945         update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2946     }
2947
2948     if (vcm != nullptr)
2949     {
2950         *vcm = init_vcm(fplog, &mtop->groups, ir);
2951     }
2952
2953     if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2954     {
2955         if (ir->etc == etcBERENDSEN)
2956         {
2957             please_cite(fplog, "Berendsen84a");
2958         }
2959         if (ir->etc == etcVRESCALE)
2960         {
2961             please_cite(fplog, "Bussi2007a");
2962         }
2963         if (ir->eI == eiSD1)
2964         {
2965             please_cite(fplog, "Goga2012");
2966         }
2967     }
2968     init_nrnb(nrnb);
2969
2970     if (nfile != -1)
2971     {
2972         *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2973
2974         *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2975                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
2976     }
2977
2978     /* Initialize variables */
2979     clear_mat(force_vir);
2980     clear_mat(shake_vir);
2981     clear_rvec(mu_tot);
2982     clear_mat(total_vir);
2983     clear_mat(pres);
2984 }