/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/bonded.h"
#include "gromacs/listed_forces/disre.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/listed_forces/manage_threading.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
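
/* Illustrative only: a minimal sketch (kept out of the build with #if 0) of
 * how this override combines with the run configuration further down in
 * do_force_cutsVERLET(); the helper name and parameters are hypothetical.
 */
#if 0
static bool decideAlternatingGpuWait(bool useGpuPme, bool useGpuNonbonded, bool haveDomDec)
{
    /* The alternating poll-based wait only pays off when both the PME and
     * the nonbonded task run on a GPU on this rank and no domain
     * decomposition imposes its own ordering constraints.
     */
    return !c_disableAlternatingWait && useGpuPme && useGpuNonbonded && !haveDomDec;
}
#endif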
void print_time(FILE *out,
                gmx_walltime_accounting_t walltime_accounting,
                int64_t step,
                const t_inputrec *ir,
                const t_commrec *cr)
{
    time_t finish;
    double dt, elapsed_seconds, time_per_step;

    fputs(gmx::int64ToString(step).c_str(), out);

    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish       = static_cast<time_t>(seconds_since_epoch + dt);
                auto timebuf = gmx_ctime_r(&finish);
                timebuf.erase(timebuf.find_first_of('\n'));
                fputs(", will finish ", out);
                fputs(timebuf.c_str(), out);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
            }
        }
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }

    GMX_UNUSED_VALUE(cr);

    fflush(out);
}
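
/* Worked example of the performance estimate above: ir->delta_t is in ps,
 * so delta_t/1000 is the simulated ns per step and 24*60*60/time_per_step
 * is the number of steps completed per day of wall time. With a 2 fs step
 * (delta_t = 0.002) and 0.005 s of wall time per step this gives
 * (0.002/1000)*(86400/0.005) = 34.56 ns/day. A standalone sketch of the
 * same arithmetic (hypothetical helper, not part of this file):
 */
#if 0
static double estimateNsPerDay(double deltaTInPs, double secondsPerStep)
{
    /* ns simulated per step, times steps per day of wall clock time */
    return deltaTInPs/1000.0*24*60*60/secondsPerStep;
}
#endif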
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    if (!fplog)
    {
        return;
    }

    time_t temp_time = static_cast<time_t>(the_time);

    auto   timebuf = gmx_ctime_r(&temp_time);

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
}
void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
    const int end = forceToAdd.size();

    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
                        tensor vir_part, const t_graph *graph, const matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md.
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
static void pull_potential_wrapper(const t_commrec *cr,
                                   const t_inputrec *ir,
                                   const matrix box, gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial *force,
                                   const t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   const real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc pbc;
    real  dvdl;

    /* Calculate the center of mass forces; this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    dvdl = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(const t_commrec *cr,
                                   gmx::ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t *enerd,
                                   gmx_wallcycle_t wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);

    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE *fp,
                               const t_mdatoms *md,
                               const t_commrec *cr,
                               int64_t step,
                               real forceTolerance,
                               const rvec *x,
                               const rvec *f)
{
    real       force2Tolerance = gmx::square(forceTolerance);
    gmx::index numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
    }
}
static void post_process_forces(const t_commrec *cr,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const matrix box,
                                const rvec x[],
                                rvec f[],
                                gmx::ForceWithVirial *forceWithVirial,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                const t_graph *graph,
                                const t_forcerec *fr,
                                const gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->haveDirectVirialContributions)
    {
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL) != 0, virial,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(const t_forcerec *fr,
                         const interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         int64_t step,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             nbv->nbat,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nblGpu[0],
                                 nbv->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbv->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}
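
/* Worked example of the flop-counter arithmetic above (the layout of the
 * eNR_NBNXN_* table is an assumption stated by the comments in this
 * function): each force+energy counter directly follows its force-only
 * counterpart, hence the "+ 1" when GMX_FORCE_ENERGY is set, and the
 * Coulomb-only counters sit at the fixed offset
 * eNR_NBNXN_RF - eNR_NBNXN_LJ_RF from the LJ+Coulomb ones, which is exactly
 * the shift applied when counting the natpair_q pairs.
 */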
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec *fr,
                             rvec x[],
                             rvec f[],
                             const t_mdatoms *mdatoms,
                             t_lambda *fepvals,
                             real *lambda,
                             gmx_enerdata_t *enerd,
                             int flags,
                             t_nrnb *nrnb,
                             gmx_wallcycle_t wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i] = 0;
    }

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energy contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}
static inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
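
/* Worked example for the estimate above: one T-coupled group at
 * ref_t = 300 K with nrdf = 3000 degrees of freedom gives
 *     kineticEnergy = 3000*0.5*300*BOLTZ ≈ 3741 kJ/mol
 * (BOLTZ ≈ 0.0083145 kJ/(mol K)). With no uncoupled degrees of freedom the
 * scaling factor (nrdfCoupled + nrdfUncoupled)/nrdfCoupled equals 1, so
 * this is also the returned value.
 */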
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step it also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec &inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: with nstcalcenergy >> 1 we often cannot catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
    {
        gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  step,
                  enerd.term[F_EPOT],
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ],
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
    }
}
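
/* Illustrative magnitude: with the ~3741 kJ/mol kinetic-energy estimate from
 * the example above, the potential energy would have to exceed
 * c_thresholdFactor*3741 ≈ 3.7e9 kJ/mol at the initial step to trigger the
 * fatal error, so even poorly equilibrated systems pass while
 * overlapping-atom blow-ups are caught.
 */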
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     awh              The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation Enforced rotation module.
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE *fplog,
                     const t_commrec *cr,
                     const t_inputrec *inputrec,
                     gmx::Awh *awh,
                     gmx_enfrot *enforcedRotation,
                     int64_t step,
                     double t,
                     gmx_wallcycle_t wcycle,
                     ForceProviders *forceProviders,
                     matrix box,
                     gmx::ArrayRef<const gmx::RVec> x,
                     const t_mdatoms *mdatoms,
                     real *lambda,
                     int forceFlags,
                     gmx::ForceWithVirial *forceWithVirial,
                     gmx_enerdata_t *enerd,
                     gmx_edsam *ed,
                     gmx_bool bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               forceWithVirial,
                               mdatoms, enerd, lambda, t,
                               wcycle);

        if (awh)
        {
            enerd->term[F_COM_PULL] +=
                awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                                  forceWithVirial,
                                                  t, step, wcycle, fplog);
        }
    }

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}
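
/* The comment above notes that these algorithms should eventually become
 * ForceProviders. As illustration, a minimal sketch of one (kept out of the
 * build with #if 0; it assumes the gmx::IForceProvider interface and the
 * ForceProviderInput/ForceProviderOutput member names declared in
 * gromacs/mdtypes/iforceprovider.h). It adds a constant external force to
 * every local atom and contributes no energy.
 */
#if 0
class ConstantForceProvider final : public gmx::IForceProvider
{
    public:
        explicit ConstantForceProvider(const gmx::RVec &force) : force_(force) {}

        void calculateForces(const gmx::ForceProviderInput &forceProviderInput,
                             gmx::ForceProviderOutput      *forceProviderOutput) override
        {
            /* The output force buffer covers the local atom range only,
             * matching what computeSpecialForces() expects of providers. */
            for (int i = 0; i < forceProviderInput.mdatoms_.homenr; i++)
            {
                gmx::RVec &f = forceProviderOutput->forceWithVirial_.force_[i];
                f[XX] += force_[XX];
                f[YY] += force_[YY];
                f[ZZ] += force_[ZZ];
            }
        }

    private:
        gmx::RVec force_;
};
#endif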
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata       The PME structure
 * \param[in]  box           The box matrix
 * \param[in]  x             Coordinate array
 * \param[in]  flags         Force flags
 * \param[in]  pmeFlags      PME flags
 * \param[in]  wcycle        The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
                                      matrix box,
                                      rvec x[],
                                      int flags,
                                      int pmeFlags,
                                      gmx_wallcycle_t wcycle)
{
    pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  wcycle   The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
                                     gmx_wallcycle_t wcycle)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
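
/* Design note: PmeForceOutputHandling::Set overwrites the host-side force
 * buffer with the gathered PME forces, which is why the comment above says
 * no accumulation is done; reduction with other force contributions happens
 * later on the CPU. The enum's alternative, reduce-with-input behavior,
 * would be needed if another task had already accumulated into the buffer.
 */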
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in,out] pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     pmeFlags         PME flags
 * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
                                        gmx_pme_t *pmedata,
                                        gmx::ArrayRefWithPadding<gmx::RVec> *force,
                                        gmx::ForceWithVirial *forceWithVirial,
                                        rvec fshift[],
                                        gmx_enerdata_t *enerd,
                                        int flags,
                                        int pmeFlags,
                                        bool haveOtherWork,
                                        gmx_wallcycle_t wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
                                                    flags, eatLocal,
                                                    haveOtherWork,
                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                    fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
            if (isNbGpuDone)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
                                               nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
            }
        }
    }
}
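
/* The control flow above is an instance of a generic two-task polling
 * pattern; a minimal abstract sketch (illustrative only, with hypothetical
 * callables; kept out of the build with #if 0):
 */
#if 0
template<typename TryFinishA, typename ReduceA, typename TryFinishB, typename ReduceB>
static void pollTwoTasks(TryFinishA tryFinishA, ReduceA reduceA,
                         TryFinishB tryFinishB, ReduceB reduceB)
{
    bool doneA = false, doneB = false;
    while (!doneA || !doneB)
    {
        /* Once the other task has finished, the poll degrades to a blocking wait */
        if (!doneA && (doneA = tryFinishA(/* blocking = */ doneB)))
        {
            reduceA(); /* overlapped with task B if it is still running */
        }
        if (!doneB && (doneB = tryFinishB(/* blocking = */ doneA)))
        {
            reduceB(); /* overlapped with task A if it is still running */
        }
    }
}
#endif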
/*! \brief
 *  Launch the dynamic rolling pruning GPU task.
 *
 *  We currently alternate local/non-local list pruning in odd-even steps
 *  (only pruning every second step without DD).
 *
 * \param[in]  cr        The communication record
 * \param[in]  nbv       Nonbonded verlet structure
 * \param[in]  inputrec  The input record
 * \param[in]  step      The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec *inputrec,
                                           const int64_t step)
{
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || DOMAINDECOMP(cr)))
    {
        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                          stepIsEven ? eintLocal : eintNonlocal,
                                          numRollingParts);
    }
}
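
/* Worked example of the scheduling above: with nstlist = 10 and the outer
 * list created at step 100, rolling pruning can run at steps 101-108
 * (stepWithCurrentList 1-8). Without DD only the even offsets 2, 4, 6 and 8
 * prune (the local list); with DD the odd offsets prune the non-local list
 * and the even offsets the local list, so each list is pruned every other step.
 */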
static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx::Awh *awh,
                                gmx_enfrot *enforcedRotation,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
                                history_t *hist,
                                gmx::ArrayRefWithPadding<gmx::RVec> force,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda,
                                t_graph *graph,
                                t_forcerec *fr,
                                gmx::PpForceWorkload *ppForceWorkload,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                rvec mu_tot,
                                double t,
                                gmx_edsam *ed,
                                const int flags,
                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
    const int  pmeFlags   = GMX_PME_SPREAD | GMX_PME_SOLVE |
        ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
        ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS &&
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
    }

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }
    }

    nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
                                 fr->shift_vec, nbv->nbat);

#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */

    if (useGpuPme)
    {
        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
    }

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!DOMAINDECOMP(cr))
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
                              0, vzero, box_diag,
                              nullptr, 0, mdatoms->homenr, -1,
                              fr->cginfo, x.unpaddedArrayRef(),
                              0, nullptr,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                       fr->cginfo, x.unpaddedArrayRef(),
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);

        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS)
        {
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
        }

        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS && fr->gpuBonded)
        {
            /* Now we put all atoms on the grid, we can assign bonded
             * interactions to the GPU, where the grid order is
             * needed. Also the xq, f and fshift device buffers have
             * been reallocated if needed, so the bonded code can
             * learn about them. */
            // TODO the xq, f, and fshift buffers are now shared
            // resources, so they should be maintained by a
            // higher-level object than the nb module.
            fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                  top->idef,
                                                                  nbnxn_gpu_get_xq(nbv->gpu_nbv),
                                                                  nbnxn_gpu_get_f(nbv->gpu_nbv),
                                                                  nbnxn_gpu_get_fshift(nbv->gpu_nbv));
            ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
        }

        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                            &top->excls,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
        {
            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
        }
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nblGpu[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                        nbv->nbat, wcycle);
    }

    if (bUseGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            ddOpenBalanceRegionGpu(cr->dd);
        }

        wallcycle_start(wcycle, ewcLAUNCH_GPU);

        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        // bonded work not split into separate local and non-local, so with DD
        // we can only launch the kernel after non-local coordinates have been received.
        if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
        {
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchKernels(fr, flags, box);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        }

        /* launch local nonbonded work on GPU */
        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (useGpuPme)
    {
        // In PME GPU and mixed mode we launch FFT / gather after the
        // X copy/transform to allow overlap as well as after the GPU NB
        // launch to avoid FFT launch overhead hijacking the CPU and delaying
        // the nonbonded kernel.
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                                &top->excls,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);
            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
            {
                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
            }
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);

            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                            nbv->nbat, wcycle);
        }

        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU);

            /* launch non-local nonbonded tasks on GPU */
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            if (ppForceWorkload->haveGpuBondedWork)
            {
                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
                fr->gpuBonded->launchKernels(fr, flags, box);
                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
            }

            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (DOMAINDECOMP(cr))
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                     flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
        {
            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
            fr->gpuBonded->launchEnergyTransfer();
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        }
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* TODO: update comment
             * We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }
        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);
    }

    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */
    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded force to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);

        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);

        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         step, t, wcycle,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
                         ed, bNS);

    if (bUseOrEmulGPU)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                           flags, eatNonlocal,
                                           ppForceWorkload->haveGpuBondedWork,
                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                           fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }

            /* skip the reduction if there was no non-local work to do */
            if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
                                               nbv->nbat, f, wcycle);
            }
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
        if (bDoForces)
        {
            dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
        }
    }

    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
    }

    if (!alternateGpuWait && useGpuPme)
    {
        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
    }

    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && bUseGPU)
    {
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         * of the step time.
         */
        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                   flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
            {
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            }
            ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
        }
    }

    if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
    {
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);
    }

    if (useGpuPme)
    {
        pme_gpu_reinit_computation(fr->pmedata, wcycle);
    }

    if (bUseGPU)
    {
        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);

        /* Is dynamic pair-list pruning activated? */
        if (nbv->listParams->useDynamicPruning)
        {
            launchGpuRollingPruning(cr, nbv, inputrec, step);
        }
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
    {
        wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
        // in principle this should be included in the DD balancing region,
        // but generally it is infrequent so we'll omit it for the sake of
        // simpler code
        fr->gpuBonded->accumulateEnergyTerms(enerd);
        wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
        fr->gpuBonded->clearEnergies();
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (bUseOrEmulGPU && !alternateGpuWait)
    {
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
                                       nbv->nbat, f, wcycle);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }

    if (bDoForces)
    {
        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
1783 static void do_force_cutsGROUP(FILE *fplog,
1784 const t_commrec *cr,
1785 const gmx_multisim_t *ms,
1786 const t_inputrec *inputrec,
1788 gmx_enfrot *enforcedRotation,
1791 gmx_wallcycle_t wcycle,
1792 gmx_localtop_t *top,
1793 const gmx_groups_t *groups,
1794 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1796 gmx::ArrayRefWithPadding<gmx::RVec> force,
1798 const t_mdatoms *mdatoms,
1799 gmx_enerdata_t *enerd,
1804 const gmx_vsite_t *vsite,
1809 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1810 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1814 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1818 const int start = 0;
1819 const int homenr = mdatoms->homenr;
1821 clear_mat(vir_force);
1824 if (DOMAINDECOMP(cr))
1826 cg1 = cr->dd->globalAtomGroupIndices.size();
1837 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1838 bNS = ((flags & GMX_FORCE_NS) != 0);
1839 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1840 bFillGrid = (bNS && bStateChanged);
1841 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1842 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1846 update_forcerec(fr, box);
1848 if (inputrecNeedMutot(inputrec))
1850 /* Calculate total (local) dipole moment in a temporary common array.
1851 * This makes it possible to sum them over nodes faster.
1853 calc_mu(start, homenr,
1854 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1859 if (fr->ePBC != epbcNONE)
1861 /* Compute shift vectors every step,
1862 * because of pressure coupling or box deformation!
1864 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1866 calc_shifts(box, fr->shift_vec);
1871 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1872 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1873 inc_nrnb(nrnb, eNR_CGCM, homenr);
1874 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1876 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1878 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1883 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1884 inc_nrnb(nrnb, eNR_CGCM, homenr);
1887 if (bCalcCGCM && gmx_debug_at)
1889 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1893 if (!thisRankHasDuty(cr, DUTY_PME))
1895 /* Send particle coordinates to the pme nodes.
1896 * Since this is only implemented for domain decomposition
1897 * and domain decomposition does not use the graph,
1898 * we do not need to worry about shifting.
1900 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1901 lambda[efptCOUL], lambda[efptVDW],
1902 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1905 #endif /* GMX_MPI */
1907 /* Communicate coordinates and sum dipole if necessary */
1908 if (DOMAINDECOMP(cr))
1910 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1912 /* No GPU support, no move_x overlap, so reopen the balance region here */
1913 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1915 ddReopenBalanceRegionCpu(cr->dd);
1919 if (inputrecNeedMutot(inputrec))
1925 gmx_sumd(2*DIM, mu, cr);
1926 ddReopenBalanceRegionCpu(cr->dd);
1928 for (i = 0; i < 2; i++)
1930 for (j = 0; j < DIM; j++)
1932 fr->mu_tot[i][j] = mu[i*DIM + j];
1936 if (fr->efep == efepNO)
1938 copy_rvec(fr->mu_tot[0], mu_tot);
1942 for (j = 0; j < DIM; j++)
1945 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1950 /* Reset energies */
1951 reset_enerdata(enerd);
1952 clear_rvecs(SHIFTS, fr->fshift);
1956 wallcycle_start(wcycle, ewcNS);
1958 if (graph && bStateChanged)
1960 /* Calculate intramolecular shift vectors to make molecules whole */
1961 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1964 /* Do the actual neighbour searching */
1966 groups, top, mdatoms,
1967 cr, nrnb, bFillGrid);
1969 wallcycle_stop(wcycle, ewcNS);
1972 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1974 wallcycle_start(wcycle, ewcPPDURINGPME);
1975 dd_force_flop_start(cr->dd, nrnb);
1980 wallcycle_start(wcycle, ewcROT);
1981 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1982 wallcycle_stop(wcycle, ewcROT);
1985 /* Temporary solution until all routines take PaddedRVecVector */
1986 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1988 /* Start the force cycle counter.
1989 * Note that a different counter is used for dynamic load balancing.
1991 wallcycle_start(wcycle, ewcFORCE);
    gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();

        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * separately, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
            forceRef = *fr->forceBufferForDirectVirialContributions;
            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));

    /* Clear the short- and long-range forces */
    clear_rvecs(fr->natoms_force_constr, f);

    /* forceWithVirial might need the full force atom range */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
        clear_pull_forces(inputrec->pull_work);

    /* update QMMMrec, if necessary */
        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda,
                      graph, &(top->excls), fr->mu_tot,
    wallcycle_stop(wcycle, ewcFORCE);

    if (DOMAINDECOMP(cr))
        dd_force_flop_stop(cr->dd, nrnb);

        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            ddCloseBalanceRegionCpu(cr->dd);

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
                         flags, &forceWithVirial, enerd,

    /* Communicate the forces */
    if (DOMAINDECOMP(cr))
        dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
        /* Do we need to communicate the separate force array
         * for terms that do not contribute to the single sum virial?
         * Position restraints and electric fields do not introduce
         * inter-cg forces, only full electrostatics methods do.
         * When we do not calculate the virial, fr->f_novirsum = f,
         * so we have already communicated these forces.
         */
        if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
            (flags & GMX_FORCE_VIRIAL))
            dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);

    /* If we have NoVirSum forces, but we do not calculate the virial,
     * we sum fr->f_novirsum=f later.
     */
    if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
                       &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);

    if (flags & GMX_FORCE_VIRIAL)
        /* Calculation of the virial must be done after vsites! */
        calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
                    vir_force, graph, box, nrnb, fr, inputrec->ePBC);

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);

        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,

    if (flags & GMX_FORCE_ENERGY)
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
void do_force(FILE *fplog,
              const t_commrec *cr,
              const gmx_multisim_t *ms,
              const t_inputrec *inputrec,
              gmx_enfrot *enforcedRotation,
              gmx_wallcycle_t wcycle,
              gmx_localtop_t *top,
              const gmx_groups_t *groups,
              gmx::ArrayRefWithPadding<gmx::RVec> x,     //NOLINT(performance-unnecessary-value-param)
              gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
              const t_mdatoms *mdatoms,
              gmx_enerdata_t *enerd,
              gmx::ArrayRef<real> lambda,
              gmx::PpForceWorkload *ppForceWorkload,
              const gmx_vsite_t *vsite,
              DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
              DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
    /* modify force flag if not doing nonbonded */
    if (!fr->bNonbonded)
        flags &= ~GMX_FORCE_NONBONDED;

    switch (inputrec->cutoff_scheme)
            do_force_cutsVERLET(fplog, cr, ms, inputrec,
                                awh, enforcedRotation, step, nrnb, wcycle,
                                lambda.data(), graph,
                                ddOpenBalanceRegion,
                                ddCloseBalanceRegion);
            do_force_cutsGROUP(fplog, cr, ms, inputrec,
                               awh, enforcedRotation, step, nrnb, wcycle,
                               lambda.data(), graph,
                               ddOpenBalanceRegion,
                               ddCloseBalanceRegion);
            gmx_incons("Invalid cut-off scheme passed!");
    /* In case we don't have constraints and are using GPUs, the next
     * balancing region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite
     * spread, virial calculation and COM pulling, is thus not included
     * in the balance timing, which is OK as most tasks do communication.
     */
    if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
                        const t_inputrec *ir, const t_mdatoms *md,

    int  i, m, start, end;
    real dt = ir->delta_t;

    /* We need to allocate one element extra, since we might use
     * (unaligned) 4-wide SIMD loads to access rvec entries.
     */
    snew(savex, state->natoms + 1);

        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
                start, md->homenr, end);

    /* Do a first constraining step to reset particles... */
    step = ir->init_step;
        char buf[STEPSTRSIZE];
        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
                gmx_step_str(step, buf));

    /* constrain the current position */
    constr->apply(TRUE, FALSE,
                  state->x.rvec_array(), state->x.rvec_array(), nullptr,
                  state->lambda[efptBONDED], &dvdl_dum,
                  nullptr, nullptr, gmx::ConstraintVariable::Positions);

    /* constrain the initial velocity, and save it */
    /* also may be useful if we need the ekin from the halfstep for velocity verlet */
    constr->apply(TRUE, FALSE,
                  state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
                  state->lambda[efptBONDED], &dvdl_dum,
                  nullptr, nullptr, gmx::ConstraintVariable::Velocities);

    /* constrain the initial velocities at t-dt/2 */
    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
        auto x = makeArrayRef(state->x).subArray(start, end);
        auto v = makeArrayRef(state->v).subArray(start, end);
        for (i = start; (i < end); i++)
            for (m = 0; (m < DIM); m++)
                /* Reverse the velocity */
                /* Store the position at t-dt in buf */
                savex[i][m] = x[i][m] + dt*v[i][m];
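        /* Sketch of the back-step above (illustrative): with the velocity
         * already reversed, savex = x(0) + dt*(-v(0)) = x(0) - dt*v(0) is an
         * Euler estimate of x(-dt). Constraining savex below against the
         * reference x(0) then yields constraint-satisfying positions at
         * t = -dt, from which consistent velocities at t - dt/2 follow.
         */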
        /* Shake the positions at t=-dt with the positions at t=0
         * as reference coordinates.
         */
            char buf[STEPSTRSIZE];
            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
                    gmx_step_str(step, buf));

        constr->apply(TRUE, FALSE,
                      state->x.rvec_array(), savex, nullptr,
                      state->lambda[efptBONDED], &dvdl_dum,
                      state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);

        for (i = start; i < end; i++)
            for (m = 0; m < DIM; m++)
                /* Re-reverse the velocities */
integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
                double *enerout, double *virout)
    double enersum, virsum;
    double invscale, invscale2, invscale3;
    double r, ea, eb, ec, pa, pb, pc, pd;

    invscale  = 1.0/scale;
    invscale2 = invscale*invscale;
    invscale3 = invscale*invscale2;

    /* Following summation derived from cubic spline definition,
     * Numerical Recipes in C, second edition, p. 113-116. Exact for
     * the cubic spline. We first calculate the negative of the
     * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
     * add the more standard, abrupt cutoff correction to that result,
     * yielding the long-range correction for a switched function. We
     * perform both the pressure and energy loops at the same time for
     * simplicity, as the computational cost is low. */
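    /* A sketch of that summation (illustrative, up to constant factors
     * absorbed into tabfactor): within table bin ri the cubic spline is
     *     y(t) = Y + F*t + G*t^2 + H*t^3,  t in [0, 1),
     * at distance r = (ri + t)*invscale. Expanding the energy integrand
     * 4*pi*r^2*y(r) (and r^3*y'(r) for the virial) in powers of t and
     * integrating t from 0 to 1 per bin produces exactly the ea/eb/ec and
     * pa/pb/pc/pd coefficient combinations accumulated in the loop below.
     */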
    /* Since the dispersion table has been scaled down by a factor of
     * 6.0 and the repulsion by a factor of 12.0 to compensate for the
     * c6/c12 parameters inside nbfp[] being scaled up (to save
     * flops in kernels), we need to correct for this.
     */

    for (ri = rstart; ri < rend; ++ri)
        eb = 2.0*invscale2*r;
        pb = 3.0*invscale2*r;
        pc = 3.0*invscale*r*r;

        /* this "8" is from the packing in the vdwtab array - perhaps
           should be defined? */
        offset = 8*ri + offstart;
        y0 = vdwtab[offset];
        f  = vdwtab[offset+1];
        g  = vdwtab[offset+2];
        h  = vdwtab[offset+3];
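        /* Layout note (an assumption based on the table format asserted by
         * the caller): the stride of 8 is two interactions times four cubic
         * spline coefficients (Y, F, G, H) per table point, with dispersion
         * at offset 0 and repulsion at offset 4, which is what the offstart
         * argument selects.
         */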
        enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
        virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);

    *enerout = 4.0*M_PI*enersum*tabfactor;
    *virout  = 4.0*M_PI*virsum*tabfactor;
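/* Usage sketch (hypothetical values, mirroring the call made from
 * calc_enervirdiff() below): integrate the dispersion part (offstart 0)
 * of a table with 500 points per distance unit between bins ri0 and ri1:
 *
 *     double enerContrib, virContrib;
 *     integrate_table(vdwtab, 500.0, 0, ri0, ri1, &enerContrib, &virContrib);
 */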
void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
    double eners[2], virs[2], enersum, virsum;
    double r0, rc3, rc9;
    real   scale, *vdwtab;

    fr->enershiftsix    = 0;
    fr->enershifttwelve = 0;
    fr->enerdiffsix     = 0;
    fr->enerdifftwelve  = 0;
    fr->virdifftwelve   = 0;

    const interaction_const_t *ic = fr->ic;

    if (eDispCorr != edispcNO)
        for (i = 0; i < 2; i++)

        if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
            (ic->vdw_modifier == eintmodPOTSWITCH) ||
            (ic->vdw_modifier == eintmodFORCESWITCH) ||
            (ic->vdwtype == evdwSHIFT) ||
            (ic->vdwtype == evdwSWITCH))
            if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
                 (ic->vdw_modifier == eintmodFORCESWITCH) ||
                 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
                          "With dispersion correction rvdw-switch cannot be zero "
                          "for vdw-type = %s", evdw_names[ic->vdwtype]);

            /* TODO This code depends on the logic in tables.c that
               constructs the table layout, which should be made
               explicit in future cleanup. */
            GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
                       "Dispersion-correction code needs a table with both repulsion and dispersion terms");
            scale  = fr->dispersionCorrectionTable->scale;
            vdwtab = fr->dispersionCorrectionTable->data;

            /* Round the cut-offs to exact table values for precision */
            ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
            ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
            /* The code below has some support for handling force-switching, i.e.
             * when the force (instead of potential) is switched over a limited
             * region. This leads to a constant shift in the potential inside the
             * switching region, which we can handle by adding a constant energy
             * term in the force-switch case just like when we do potential-shift.
             *
             * For now this is not enabled, but to keep the functionality in the
             * code we check separately for switch and shift. When we do force-switch
             * the shifting point is rvdw_switch, while it is the cutoff when we
             * have a classical potential-shift.
             *
             * For a pure potential-shift the potential has a constant shift
             * all the way out to the cutoff, and that is it. For other forms
             * we need to calculate the constant shift up to the point where we
             * start modifying the potential.
             */
            ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
            if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
                (ic->vdwtype == evdwSHIFT))
                /* Determine the constant energy shift below rvdw_switch.
                 * Table has a scale factor since we have scaled it down to compensate
                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
                 */
                fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
            else if (ic->vdw_modifier == eintmodPOTSHIFT)
                fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
                fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));

            /* Add the constant part from 0 to rvdw_switch.
             * This integration from 0 to rvdw_switch overcounts the number
             * of interactions by 1, as it also counts the self interaction.
             * We will correct for this later.
             */
            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
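            /* Worked form of the constant part (illustrative): a constant
             * potential shift V_sh applied inside radius r0 integrates to
             *     int_0^r0 4*pi*r^2 * V_sh dr = (4*pi/3) * V_sh * r0^3,
             * which is the 4.0*M_PI*enershift*rc3/3.0 term added above.
             */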
            /* Calculate the contribution in the range [r0,r1] where we
             * modify the potential. For a pure potential-shift modifier we will
             * have ri0==ri1, and there will not be any contribution here.
             */
            for (i = 0; i < 2; i++)
                integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
                eners[i] -= enersum;

            /* Alright: Above we compensated by REMOVING the parts outside r0
             * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
             *
             * Regardless of whether r0 is the point where we start switching,
             * or the cutoff where we calculated the constant shift, we include
             * all the parts we are missing out to infinity from r0 by
             * calculating the analytical dispersion correction.
             */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
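            /* The analytical tails used above (illustrative): for the plain
             * dispersion and repulsion potentials,
             *     int_r0^inf 4*pi*r^2 * (-1/r^6)   dr = -4*pi/(3*r0^3)
             *     int_r0^inf 4*pi*r^2 * ( 1/r^12)  dr =  4*pi/(9*r0^9)
             * and for the virial integrand r*dV/dr,
             *     int_r0^inf 4*pi*r^2 * ( 6/r^6)   dr =  8*pi/r0^3
             *     int_r0^inf 4*pi*r^2 * (-12/r^12) dr = -16*pi/(3*r0^9),
             * matching the four terms added here.
             */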
        else if (ic->vdwtype == evdwCUT ||
                 EVDW_PME(ic->vdwtype) ||
                 ic->vdwtype == evdwUSER)
            if (ic->vdwtype == evdwUSER && fplog)
                        "WARNING: using dispersion correction with user tables\n");

            /* Note that with LJ-PME, the dispersion correction is multiplied
             * by the difference between the actual C6 and the value of C6
             * that would produce the combination rule.
             * This means the normal energy and virial difference formulas
             * can be used here.
             */

            rc3 = ic->rvdw*ic->rvdw*ic->rvdw;

            /* Contribution beyond the cut-off */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            if (ic->vdw_modifier == eintmodPOTSHIFT)
                /* Contribution within the cut-off */
                eners[0] += -4.0*M_PI/(3.0*rc3);
                eners[1] +=  4.0*M_PI/(3.0*rc9);
            /* Contribution beyond the cut-off */
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
                      "Dispersion correction is not implemented for vdw-type = %s",
                      evdw_names[ic->vdwtype]);
        /* When we deprecate the group kernels the code below can go too */
        if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
            /* Calculate self-interaction coefficient (assuming that
             * the reciprocal-space contribution is constant in the
             * region that contributes to the self-interaction).
             */
            fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;

            eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
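        /* Background sketch (hedged; assumes the standard Ewald splitting
         * of the 1/r^6 kernel with coefficient beta = ewaldcoeff_lj): the
         * reciprocal-space part of that kernel tends to beta^6/6 as r -> 0,
         * giving the self-interaction shift above, and integrating it over
         * all space yields the (sqrt(pi)*beta)^3 factors in the energy and
         * virial terms.
         */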
        fr->enerdiffsix    = eners[0];
        fr->enerdifftwelve = eners[1];
        /* The 0.5 is due to the Gromacs definition of the virial */
        fr->virdiffsix     = 0.5*virs[0];
        fr->virdifftwelve  = 0.5*virs[1];
void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
                   const matrix box, real lambda, tensor pres, tensor virial,
                   real *prescorr, real *enercorr, real *dvdlcorr)
    gmx_bool bCorrAll, bCorrPres;
    real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;

    if (ir->eDispCorr != edispcNO)
        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
                     ir->eDispCorr == edispcAllEnerPres);
        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
                     ir->eDispCorr == edispcAllEnerPres);

        invvol = 1/det(box);
            /* Only correct for the interactions with the inserted molecule */
            dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
            dens   = fr->numAtomsForDispersionCorrection*invvol;
            ninter = 0.5*fr->numAtomsForDispersionCorrection;

        if (ir->efep == efepNO)
            avcsix    = fr->avcsix[0];
            avctwelve = fr->avctwelve[0];
            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];

        enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
        *enercorr += avcsix*enerdiff;
        if (ir->efep != efepNO)
            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;

        enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
        *enercorr += avctwelve*enerdiff;
        if (fr->efep != efepNO)
            dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
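        /* dH/dlambda bookkeeping (illustrative): since the averages are
         * mixed as
         *     <C>(lambda) = (1 - lambda)*<C>_A + lambda*<C>_B,
         * the derivative d<C>/dlambda = <C>_B - <C>_A, so the dvdlambda
         * terms above multiply enerdiff by the plain difference of the
         * A and B averages.
         */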
            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
            if (ir->eDispCorr == edispcAllEnerPres)
                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
            /* The factor 2 is because of the Gromacs virial definition */
            spres = -2.0*invvol*svir*PRESFAC;
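            /* Factor bookkeeping (illustrative): with the GROMACS virial
             * definition Xi = -1/2 sum_i r_i (x) F_i, the pressure tensor is
             *     P = (2/V) * (Ekin - Xi),
             * so the correction to Xi carries the 0.5 stored in virdiffsix
             * and virdifftwelve, while the pressure correction above carries
             * the compensating factor -2/V (PRESFAC converts to bar).
             */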
            for (m = 0; m < DIM; m++)
                virial[m][m] += svir;
                pres[m][m]   += spres;

        /* Can't currently control when it prints; for now, just print when debugging */
            fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
                    "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
                    *enercorr, spres, svir);
            fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);

        if (fr->efep != efepNO)
            *dvdlcorr += dvdlambda;
static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                            const gmx_mtop_t *mtop, rvec x[],

    if (bFirst && fplog)
        fprintf(fplog, "Removing pbc first time\n");

    for (const gmx_molblock_t &molb : mtop->molblock)
        const gmx_moltype_t &moltype = mtop->moltype[molb.type];
        if (moltype.atoms.nr == 1 ||
            (!bFirst && moltype.cgs.nr == 1))
            /* Just one atom or charge group in the molecule, no PBC required */
            as += molb.nmol*moltype.atoms.nr;

            mk_graph_moltype(moltype, graph);

            for (mol = 0; mol < molb.nmol; mol++)
                mk_mshift(fplog, graph, ePBC, box, x+as);
                shift_self(graph, box, x+as);
                /* The molecule is whole now.
                 * We don't need the second mk_mshift call as in do_pbc_first,
                 * since we no longer need this graph.
                 */
                as += moltype.atoms.nr;

void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
                       const gmx_mtop_t *mtop, rvec x[])
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);

void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                 const gmx_mtop_t *mtop, rvec x[])
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
    nth = gmx_omp_nthreads_get(emntDefault);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (t = 0; t < nth; t++)
            size_t natoms = x.size();
            size_t offset = (natoms*t)/nth;
            size_t len    = (natoms*(t + 1))/nth - offset;
            put_atoms_in_box(ePBC, box, x.subArray(offset, len));
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
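    /* Partitioning sketch (illustrative): with integer division,
     * offset = (natoms*t)/nth produces contiguous, non-overlapping ranges
     * that exactly cover [0, natoms) with no remainder handling; e.g.
     * natoms = 10, nth = 3 gives the ranges [0,3), [3,6) and [6,10).
     */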
// TODO This can be cleaned up a lot, and move back to runner.cpp
void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
                const t_inputrec *inputrec,
                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                gmx_walltime_accounting_t walltime_accounting,
                nonbonded_verlet_t *nbv,
                const gmx_pme_t *pme,
                gmx_bool bWriteStat)
    t_nrnb *nrnb_tot = nullptr;
    double  nbfs = 0, mflop = 0;
    double  elapsed_time,
            elapsed_time_over_all_ranks,
            elapsed_time_over_all_threads,
            elapsed_time_over_all_threads_over_all_ranks;
    /* Control whether it is valid to print a report. Only the
       simulation master may print, but it should not do so if the run
       terminated e.g. before a scheduled reset step. This is
       complicated by the fact that PME ranks are unaware of the
       reason why they were sent a pmerecvqxFINISH. To avoid
       communication deadlocks, we always do the communication for the
       report, even if we've decided not to write the report, because
       how long it takes to finish the run is not important when we've
       decided not to report on the simulation performance.

       Further, we only report performance for dynamical integrators,
       because those are the only ones for which we plan to
       consider doing any optimizations. */
    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);

    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
        printReport = false;
        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

    elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
    elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);

        /* reduce elapsed_time over all MPI ranks in the current simulation */
        MPI_Allreduce(&elapsed_time,
                      &elapsed_time_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
        elapsed_time_over_all_ranks /= cr->nnodes;
        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
         * current simulation. */
        MPI_Allreduce(&elapsed_time_over_all_threads,
                      &elapsed_time_over_all_threads_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

        elapsed_time_over_all_ranks                  = elapsed_time;
        elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;

        print_flop(fplog, nrnb_tot, &nbfs, &mflop);

    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
        print_dd_statistics(cr, inputrec, fplog);

    /* TODO Move the responsibility for any scaling by thread counts
     * to the code that handled the thread region, so that there's a
     * mechanism to keep cycle counting working during the transition
     * to task parallelism. */
    int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    int nthreads_pme = gmx_omp_nthreads_get(emntPME);
    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
    auto cycle_sum(wallcycle_sum(cr, wcycle));

        auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
        gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
        if (pme_gpu_task_enabled(pme))
            pme_gpu_get_timings(pme, &pme_gpu_timings);
        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                        elapsed_time_over_all_ranks,

    if (EI_DYNAMICS(inputrec->eI))
        delta_t = inputrec->delta_t;

        print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
                   elapsed_time_over_all_ranks,
                   walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
                   delta_t, nbfs, mflop);

        print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
                   elapsed_time_over_all_ranks,
                   walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
                   delta_t, nbfs, mflop);
void initialize_lambdas(FILE *fplog,
                        const t_inputrec &ir,
                        gmx::ArrayRef<real> lambda,

    /* TODO: Clean up initialization of fep_state and lambda in
       t_state. This function works, but could probably use a logic
       rewrite to keep all the different types of efep straight. */

    if ((ir.efep == efepNO) && (!ir.bSimTemp))

    const t_lambda *fep = ir.fepvals;
    *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
                                         value if a checkpoint is set -- a
                                         kludge for now */

    for (int i = 0; i < efptNR; i++)
        /* overwrite the lambda state with init_lambda, for backwards compatibility */
        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
            thisLambda = fep->init_lambda;
            thisLambda = fep->all_lambda[i][fep->init_fep_state];
        lambda[i] = thisLambda;
        if (lam0 != nullptr)
            lam0[i] = thisLambda;
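    /* Example of the precedence above (illustrative): with init_lambda = 0.5
     * every component gets lambda[i] = 0.5, ignoring the per-component
     * lambda vectors; with init_lambda left at -1, component i instead takes
     * all_lambda[i][init_fep_state], i.e. its value at the initial lambda
     * state.
     */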
        /* need to rescale control temperatures to match current state */
        for (int i = 0; i < ir.opts.ngtc; i++)
            if (ir.opts.ref_t[i] > 0)
                ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];

    /* Send to the log the information on the current lambdas */
    if (fplog != nullptr)
        fprintf(fplog, "Initial vector of lambda components: [ ");
        for (const auto &l : lambda)
            fprintf(fplog, "%10.4f ", l);
        fprintf(fplog, "]\n");