2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/awh/awh.h"
51 #include "gromacs/domdec/dlbtiming.h"
52 #include "gromacs/domdec/domdec_struct.h"
53 #include "gromacs/domdec/partition.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/gpubonded.h"
67 #include "gromacs/listed-forces/manage-threading.h"
68 #include "gromacs/listed-forces/orires.h"
69 #include "gromacs/math/arrayrefwithpadding.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/units.h"
72 #include "gromacs/math/vec.h"
73 #include "gromacs/math/vecdump.h"
74 #include "gromacs/mdlib/calcmu.h"
75 #include "gromacs/mdlib/calcvir.h"
76 #include "gromacs/mdlib/constr.h"
77 #include "gromacs/mdlib/force.h"
78 #include "gromacs/mdlib/forcerec.h"
79 #include "gromacs/mdlib/gmx_omp_nthreads.h"
80 #include "gromacs/mdlib/mdrun.h"
81 #include "gromacs/mdlib/nb_verlet.h"
82 #include "gromacs/mdlib/nbnxn_atomdata.h"
83 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
84 #include "gromacs/mdlib/nbnxn_grid.h"
85 #include "gromacs/mdlib/nbnxn_search.h"
86 #include "gromacs/mdlib/ppforceworkload.h"
87 #include "gromacs/mdlib/qmmm.h"
88 #include "gromacs/mdlib/update.h"
89 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
90 #include "gromacs/mdtypes/commrec.h"
91 #include "gromacs/mdtypes/forceoutput.h"
92 #include "gromacs/mdtypes/iforceprovider.h"
93 #include "gromacs/mdtypes/inputrec.h"
94 #include "gromacs/mdtypes/md_enums.h"
95 #include "gromacs/mdtypes/state.h"
96 #include "gromacs/pbcutil/ishift.h"
97 #include "gromacs/pbcutil/mshift.h"
98 #include "gromacs/pbcutil/pbc.h"
99 #include "gromacs/pulling/pull.h"
100 #include "gromacs/pulling/pull_rotation.h"
101 #include "gromacs/timing/cyclecounter.h"
102 #include "gromacs/timing/gpu_timing.h"
103 #include "gromacs/timing/wallcycle.h"
104 #include "gromacs/timing/wallcyclereporting.h"
105 #include "gromacs/timing/walltime_accounting.h"
106 #include "gromacs/topology/topology.h"
107 #include "gromacs/utility/arrayref.h"
108 #include "gromacs/utility/basedefinitions.h"
109 #include "gromacs/utility/cstringutil.h"
110 #include "gromacs/utility/exceptions.h"
111 #include "gromacs/utility/fatalerror.h"
112 #include "gromacs/utility/gmxassert.h"
113 #include "gromacs/utility/gmxmpi.h"
114 #include "gromacs/utility/logger.h"
115 #include "gromacs/utility/pleasecite.h"
116 #include "gromacs/utility/smalloc.h"
117 #include "gromacs/utility/strconvert.h"
118 #include "gromacs/utility/sysinfo.h"
120 #include "nbnxn_gpu.h"
121 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
122 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
124 // TODO: this environment variable allows us to verify before release
125 // that on less common architectures the total cost of polling is not larger than
126 // a blocking wait (so polling does not introduce overhead when the static
127 // PME-first ordering would suffice).
128 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
131 void print_time(FILE *out,
132 gmx_walltime_accounting_t walltime_accounting,
134 const t_inputrec *ir,
138 double dt, elapsed_seconds, time_per_step;
147 fputs(gmx::int64ToString(step).c_str(), out);
150 if ((step >= ir->nstlist))
152 double seconds_since_epoch = gmx_gettime();
153 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
154 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
155 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
161 finish = static_cast<time_t>(seconds_since_epoch + dt);
162 auto timebuf = gmx_ctime_r(&finish);
163 timebuf.erase(timebuf.find_first_of('\n'));
164 fputs(", will finish ", out);
165 fputs(timebuf.c_str(), out);
169 fprintf(out, ", remaining wall clock time: %5d s ", static_cast<int>(dt));
174 fprintf(out, " performance: %.1f ns/day ",
175 ir->delta_t/1000*24*60*60/time_per_step);
184 GMX_UNUSED_VALUE(cr);
190 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
198 time_t temp_time = static_cast<time_t>(the_time);
200 auto timebuf = gmx_ctime_r(&temp_time);
202 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, timebuf.c_str());
205 void print_start(FILE *fplog, const t_commrec *cr,
206 gmx_walltime_accounting_t walltime_accounting,
211 sprintf(buf, "Started %s", name);
212 print_date_and_time(fplog, cr->nodeid, buf,
213 walltime_accounting_get_start_time_stamp(walltime_accounting));
216 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
218 const int end = forceToAdd.size();
220 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
221 #pragma omp parallel for num_threads(nt) schedule(static)
222 for (int i = 0; i < end; i++)
224 rvec_inc(f[i], forceToAdd[i]);
228 static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
229 tensor vir_part, const t_graph *graph, const matrix box,
230 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
232 /* The short-range virial from surrounding boxes */
233 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
234 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
236 /* Calculate partial virial, for local atoms only, based on short range.
237 * Total virial is computed in global_stat, called from do_md
239 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
240 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
244 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
248 static void pull_potential_wrapper(const t_commrec *cr,
249 const t_inputrec *ir,
250 const matrix box, gmx::ArrayRef<const gmx::RVec> x,
251 gmx::ForceWithVirial *force,
252 const t_mdatoms *mdatoms,
253 gmx_enerdata_t *enerd,
256 gmx_wallcycle_t wcycle)
261 /* Calculate the center of mass forces, this requires communication,
262 * which is why pull_potential is called close to other communication.
264 wallcycle_start(wcycle, ewcPULLPOT);
265 set_pbc(&pbc, ir->ePBC, box);
267 enerd->term[F_COM_PULL] +=
268 pull_potential(ir->pull_work, mdatoms, &pbc,
269 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
270 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
271 wallcycle_stop(wcycle, ewcPULLPOT);
274 static void pme_receive_force_ener(const t_commrec *cr,
275 gmx::ForceWithVirial *forceWithVirial,
276 gmx_enerdata_t *enerd,
277 gmx_wallcycle_t wcycle)
279 real e_q, e_lj, dvdl_q, dvdl_lj;
280 float cycles_ppdpme, cycles_seppme;
282 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
283 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
285 /* In case of node-splitting, the PP nodes receive the long-range
286 * forces, virial and energy from the PME nodes here.
288 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
291 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
293 enerd->term[F_COUL_RECIP] += e_q;
294 enerd->term[F_LJ_RECIP] += e_lj;
295 enerd->dvdl_lin[efptCOUL] += dvdl_q;
296 enerd->dvdl_lin[efptVDW] += dvdl_lj;
300 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
302 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
305 static void print_large_forces(FILE *fp,
313 real force2Tolerance = gmx::square(forceTolerance);
314 gmx::index numNonFinite = 0;
315 for (int i = 0; i < md->homenr; i++)
317 real force2 = norm2(f[i]);
318 bool nonFinite = !std::isfinite(force2);
319 if (force2 >= force2Tolerance || nonFinite)
321 fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
323 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
330 if (numNonFinite > 0)
332 /* Note that with MPI this fatal call on one rank might interrupt
333 * the printing on other ranks. But we can only avoid that with
334 * an expensive MPI barrier that we would need at each step.
336 gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
340 static void post_process_forces(const t_commrec *cr,
343 gmx_wallcycle_t wcycle,
344 const gmx_localtop_t *top,
348 gmx::ForceWithVirial *forceWithVirial,
350 const t_mdatoms *mdatoms,
351 const t_graph *graph,
352 const t_forcerec *fr,
353 const gmx_vsite_t *vsite,
356 if (fr->haveDirectVirialContributions)
358 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
362 /* Spread the mesh force on virtual sites to the other particles...
363 * This is parallellized. MPI communication is performed
364 * if the constructing atoms aren't local.
366 matrix virial = { { 0 } };
367 spread_vsite_f(vsite, x, fDirectVir, nullptr,
368 (flags & GMX_FORCE_VIRIAL) != 0, virial,
370 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
371 forceWithVirial->addVirialContribution(virial);
374 if (flags & GMX_FORCE_VIRIAL)
376 /* Now add the forces, this is local */
377 sum_forces(f, forceWithVirial->force_);
379 /* Add the direct virial contributions */
380 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
381 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
385 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
390 if (fr->print_force >= 0)
392 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
396 static void do_nb_verlet(const t_forcerec *fr,
397 const interaction_const_t *ic,
398 gmx_enerdata_t *enerd,
399 int flags, int ilocality,
403 gmx_wallcycle_t wcycle)
405 if (!(flags & GMX_FORCE_NONBONDED))
407 /* skip non-bonded calculation */
411 nonbonded_verlet_t *nbv = fr->nbv;
412 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
414 /* GPU kernel launch overhead is already timed separately */
415 if (fr->cutoff_scheme != ecutsVERLET)
417 gmx_incons("Invalid cut-off scheme passed!");
420 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
422 if (!bUsingGpuKernels)
424 /* When dynamic pair-list pruning is requested, we need to prune
425 * at nstlistPrune steps.
427 if (nbv->listParams->useDynamicPruning &&
428 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
430 /* Prune the pair-list beyond fr->ic->rlistPrune using
431 * the current coordinates of the atoms.
433 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
434 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
435 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
438 wallcycle_sub_start(wcycle, ewcsNONBONDED);
441 switch (nbvg->kernel_type)
443 case nbnxnk4x4_PlainC:
444 case nbnxnk4xN_SIMD_4xN:
445 case nbnxnk4xN_SIMD_2xNN:
446 nbnxn_kernel_cpu(nbvg,
453 enerd->grpp.ener[egCOULSR],
455 enerd->grpp.ener[egBHAMSR] :
456 enerd->grpp.ener[egLJSR]);
459 case nbnxnk8x8x8_GPU:
460 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
463 case nbnxnk8x8x8_PlainC:
464 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nblGpu[0],
471 enerd->grpp.ener[egCOULSR],
473 enerd->grpp.ener[egBHAMSR] :
474 enerd->grpp.ener[egLJSR]);
478 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
481 if (!bUsingGpuKernels)
483 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
486 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
487 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
489 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
491 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
492 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
494 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
498 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
500 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
501 if (flags & GMX_FORCE_ENERGY)
503 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
504 enr_nbnxn_kernel_ljc += 1;
505 enr_nbnxn_kernel_lj += 1;
508 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
509 nbvg->nbl_lists.natpair_ljq);
510 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
511 nbvg->nbl_lists.natpair_lj);
512 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
513 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
514 nbvg->nbl_lists.natpair_q);
516 if (ic->vdw_modifier == eintmodFORCESWITCH)
518 /* We add up the switch cost separately */
519 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
520 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
522 if (ic->vdw_modifier == eintmodPOTSWITCH)
524 /* We add up the switch cost separately */
525 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
526 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
528 if (ic->vdwtype == evdwPME)
530 /* We add up the LJ Ewald cost separately */
531 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
532 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
536 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
540 const t_mdatoms *mdatoms,
543 gmx_enerdata_t *enerd,
546 gmx_wallcycle_t wcycle)
549 nb_kernel_data_t kernel_data;
551 real dvdl_nb[efptNR];
556 /* Add short-range interactions */
557 donb_flags |= GMX_NONBONDED_DO_SR;
559 /* Currently all group scheme kernels always calculate (shift-)forces */
560 if (flags & GMX_FORCE_FORCES)
562 donb_flags |= GMX_NONBONDED_DO_FORCE;
564 if (flags & GMX_FORCE_VIRIAL)
566 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
568 if (flags & GMX_FORCE_ENERGY)
570 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
573 kernel_data.flags = donb_flags;
574 kernel_data.lambda = lambda;
575 kernel_data.dvdl = dvdl_nb;
577 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
578 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
580 /* reset free energy components */
581 for (i = 0; i < efptNR; i++)
586 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
588 wallcycle_sub_start(wcycle, ewcsNONBONDED);
589 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
590 for (th = 0; th < nbl_lists->nnbl; th++)
594 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
595 x, f, fr, mdatoms, &kernel_data, nrnb);
597 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
600 if (fepvals->sc_alpha != 0)
602 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
603 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
607 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
608 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
611 /* If we do foreign lambda and we have soft-core interactions
612 * we have to recalculate the (non-linear) energies contributions.
614 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
616 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
617 kernel_data.lambda = lam_i;
618 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
619 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
620 /* Note that we add to kernel_data.dvdl, but ignore the result */
622 for (i = 0; i < enerd->n_lambda; i++)
624 for (j = 0; j < efptNR; j++)
626 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
628 reset_foreign_enerdata(enerd);
629 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
630 for (th = 0; th < nbl_lists->nnbl; th++)
634 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
635 x, f, fr, mdatoms, &kernel_data, nrnb);
637 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
640 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
641 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
645 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
648 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
650 return nbv != nullptr && nbv->bUseGPU;
653 static inline void clear_rvecs_omp(int n, rvec v[])
655 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
657 /* Note that we would like to avoid this conditional by putting it
658 * into the omp pragma instead, but then we still take the full
659 * omp parallel for overhead (at least with gcc5).
663 for (int i = 0; i < n; i++)
670 #pragma omp parallel for num_threads(nth) schedule(static)
671 for (int i = 0; i < n; i++)
678 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
680 * \param groupOptions Group options, containing T-coupling options
682 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
684 real nrdfCoupled = 0;
685 real nrdfUncoupled = 0;
686 real kineticEnergy = 0;
687 for (int g = 0; g < groupOptions.ngtc; g++)
689 if (groupOptions.tau_t[g] >= 0)
691 nrdfCoupled += groupOptions.nrdf[g];
692 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
696 nrdfUncoupled += groupOptions.nrdf[g];
700 /* This conditional with > also catches nrdf=0 */
701 if (nrdfCoupled > nrdfUncoupled)
703 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
711 /*! \brief This routine checks that the potential energy is finite.
713 * Always checks that the potential energy is finite. If step equals
714 * inputrec.init_step also checks that the magnitude of the potential energy
715 * is reasonable. Terminates with a fatal error when a check fails.
716 * Note that passing this check does not guarantee finite forces,
717 * since those use slightly different arithmetics. But in most cases
718 * there is just a narrow coordinate range where forces are not finite
719 * and energies are finite.
721 * \param[in] step The step number, used for checking and printing
722 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
723 * \param[in] inputrec The input record
725 static void checkPotentialEnergyValidity(int64_t step,
726 const gmx_enerdata_t &enerd,
727 const t_inputrec &inputrec)
729 /* Threshold valid for comparing absolute potential energy against
730 * the kinetic energy. Normally one should not consider absolute
731 * potential energy values, but with a factor of one million
732 * we should never get false positives.
734 constexpr real c_thresholdFactor = 1e6;
736 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
737 real averageKineticEnergy = 0;
738 /* We only check for large potential energy at the initial step,
739 * because that is by far the most likely step for this too occur
740 * and because computing the average kinetic energy is not free.
741 * Note: nstcalcenergy >> 1 often does not allow to catch large energies
742 * before they become NaN.
744 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
746 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
749 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
750 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
752 gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
755 energyIsNotFinite ? "not finite" : "extremely high",
757 enerd.term[F_COUL_SR],
758 energyIsNotFinite ? "non-finite" : "very high",
759 energyIsNotFinite ? " or Nan" : "");
763 /*! \brief Compute forces and/or energies for special algorithms
765 * The intention is to collect all calls to algorithms that compute
766 * forces on local atoms only and that do not contribute to the local
767 * virial sum (but add their virial contribution separately).
768 * Eventually these should likely all become ForceProviders.
769 * Within this function the intention is to have algorithms that do
770 * global communication at the end, so global barriers within the MD loop
771 * are as close together as possible.
773 * \param[in] fplog The log file
774 * \param[in] cr The communication record
775 * \param[in] inputrec The input record
776 * \param[in] awh The Awh module (nullptr if none in use).
777 * \param[in] enforcedRotation Enforced rotation module.
778 * \param[in] step The current MD step
779 * \param[in] t The current time
780 * \param[in,out] wcycle Wallcycle accounting struct
781 * \param[in,out] forceProviders Pointer to a list of force providers
782 * \param[in] box The unit cell
783 * \param[in] x The coordinates
784 * \param[in] mdatoms Per atom properties
785 * \param[in] lambda Array of free-energy lambda values
786 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
787 * \param[in,out] forceWithVirial Force and virial buffers
788 * \param[in,out] enerd Energy buffer
789 * \param[in,out] ed Essential dynamics pointer
790 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
792 * \todo Remove bNS, which is used incorrectly.
793 * \todo Convert all other algorithms called here to ForceProviders.
796 computeSpecialForces(FILE *fplog,
798 const t_inputrec *inputrec,
800 gmx_enfrot *enforcedRotation,
803 gmx_wallcycle_t wcycle,
804 ForceProviders *forceProviders,
806 gmx::ArrayRef<const gmx::RVec> x,
807 const t_mdatoms *mdatoms,
810 gmx::ForceWithVirial *forceWithVirial,
811 gmx_enerdata_t *enerd,
815 const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;
817 /* NOTE: Currently all ForceProviders only provide forces.
818 * When they also provide energies, remove this conditional.
822 gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
823 gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
825 /* Collect forces from modules */
826 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
829 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
831 pull_potential_wrapper(cr, inputrec, box, x,
833 mdatoms, enerd, lambda, t,
838 enerd->term[F_COM_PULL] +=
839 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
841 t, step, wcycle, fplog);
845 rvec *f = as_rvec_array(forceWithVirial->force_.data());
847 /* Add the forces from enforced rotation potentials (if any) */
850 wallcycle_start(wcycle, ewcROTadd);
851 enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
852 wallcycle_stop(wcycle, ewcROTadd);
857 /* Note that since init_edsam() is called after the initialization
858 * of forcerec, edsam doesn't request the noVirSum force buffer.
859 * Thus if no other algorithm (e.g. PME) requires it, the forces
860 * here will contribute to the virial.
862 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
865 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
866 if (inputrec->bIMD && computeForces)
868 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
872 /*! \brief Launch the prepare_step and spread stages of PME GPU.
874 * \param[in] pmedata The PME structure
875 * \param[in] box The box matrix
876 * \param[in] x Coordinate array
877 * \param[in] flags Force flags
878 * \param[in] pmeFlags PME flags
879 * \param[in] wcycle The wallcycle structure
881 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
886 gmx_wallcycle_t wcycle)
888 pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
889 pme_gpu_launch_spread(pmedata, x, wcycle);
892 /*! \brief Launch the FFT and gather stages of PME GPU
894 * This function only implements setting the output forces (no accumulation).
896 * \param[in] pmedata The PME structure
897 * \param[in] wcycle The wallcycle structure
899 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
900 gmx_wallcycle_t wcycle)
902 pme_gpu_launch_complex_transforms(pmedata, wcycle);
903 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
907 * Polling wait for either of the PME or nonbonded GPU tasks.
909 * Instead of a static order in waiting for GPU tasks, this function
910 * polls checking which of the two tasks completes first, and does the
911 * associated force buffer reduction overlapped with the other task.
912 * By doing that, unlike static scheduling order, it can always overlap
913 * one of the reductions, regardless of the GPU task completion order.
915 * \param[in] nbv Nonbonded verlet structure
916 * \param[in,out] pmedata PME module data
917 * \param[in,out] force Force array to reduce task outputs into.
918 * \param[in,out] forceWithVirial Force and virial buffers
919 * \param[in,out] fshift Shift force output vector results are reduced into
920 * \param[in,out] enerd Energy data structure results are reduced into
921 * \param[in] flags Force flags
922 * \param[in] pmeFlags PME flags
923 * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
924 * \param[in] wcycle The wallcycle structure
926 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
928 gmx::ArrayRefWithPadding<gmx::RVec> *force,
929 gmx::ForceWithVirial *forceWithVirial,
931 gmx_enerdata_t *enerd,
935 gmx_wallcycle_t wcycle)
937 bool isPmeGpuDone = false;
938 bool isNbGpuDone = false;
941 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
943 while (!isPmeGpuDone || !isNbGpuDone)
947 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
948 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType);
953 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
954 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
955 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
958 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
959 fshift, completionType);
960 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
961 // To get the call count right, when the task finished we
962 // issue a start/stop.
963 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
964 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
967 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
968 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
970 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
971 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
978 * Launch the dynamic rolling pruning GPU task.
980 * We currently alternate local/non-local list pruning in odd-even steps
981 * (only pruning every second step without DD).
983 * \param[in] cr The communication record
984 * \param[in] nbv Nonbonded verlet structure
985 * \param[in] inputrec The input record
986 * \param[in] step The current MD step
988 static inline void launchGpuRollingPruning(const t_commrec *cr,
989 const nonbonded_verlet_t *nbv,
990 const t_inputrec *inputrec,
993 /* We should not launch the rolling pruning kernel at a search
994 * step or just before search steps, since that's useless.
995 * Without domain decomposition we prune at even steps.
996 * With domain decomposition we alternate local and non-local
997 * pruning at even and odd steps.
999 int numRollingParts = nbv->listParams->numRollingParts;
1000 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1001 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1002 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1003 if (stepWithCurrentList > 0 &&
1004 stepWithCurrentList < inputrec->nstlist - 1 &&
1005 (stepIsEven || DOMAINDECOMP(cr)))
1007 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1008 stepIsEven ? eintLocal : eintNonlocal,
1013 static void do_force_cutsVERLET(FILE *fplog,
1014 const t_commrec *cr,
1015 const gmx_multisim_t *ms,
1016 const t_inputrec *inputrec,
1018 gmx_enfrot *enforcedRotation,
1021 gmx_wallcycle_t wcycle,
1022 const gmx_localtop_t *top,
1023 const gmx_groups_t * /* groups */,
1024 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1026 gmx::ArrayRefWithPadding<gmx::RVec> force,
1028 const t_mdatoms *mdatoms,
1029 gmx_enerdata_t *enerd, t_fcdata *fcd,
1033 gmx::PpForceWorkload *ppForceWorkload,
1034 interaction_const_t *ic,
1035 const gmx_vsite_t *vsite,
1040 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1041 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1045 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1046 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1047 rvec vzero, box_diag;
1048 float cycles_pme, cycles_wait_gpu;
1049 nonbonded_verlet_t *nbv = fr->nbv;
1051 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1052 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1053 bFillGrid = (bNS && bStateChanged);
1054 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1055 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1056 bUseGPU = fr->nbv->bUseGPU;
1057 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1059 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1060 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1061 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1062 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1063 const int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
1064 ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
1065 ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
1066 ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
1068 /* At a search step we need to start the first balancing region
1069 * somewhere early inside the step after communication during domain
1070 * decomposition (and not during the previous step as usual).
1073 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1075 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1078 cycles_wait_gpu = 0;
1080 const int start = 0;
1081 const int homenr = mdatoms->homenr;
1083 clear_mat(vir_force);
1085 if (DOMAINDECOMP(cr))
1087 cg1 = cr->dd->globalAtomGroupIndices.size();
1100 update_forcerec(fr, box);
1102 if (inputrecNeedMutot(inputrec))
1104 /* Calculate total (local) dipole moment in a temporary common array.
1105 * This makes it possible to sum them over nodes faster.
1107 calc_mu(start, homenr,
1108 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1113 if (fr->ePBC != epbcNONE)
1115 /* Compute shift vectors every step,
1116 * because of pressure coupling or box deformation!
1118 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1120 calc_shifts(box, fr->shift_vec);
1125 put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr));
1126 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1128 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1130 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1134 nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
1135 fr->shift_vec, nbv->nbat);
1138 if (!thisRankHasDuty(cr, DUTY_PME))
1140 /* Send particle coordinates to the pme nodes.
1141 * Since this is only implemented for domain decomposition
1142 * and domain decomposition does not use the graph,
1143 * we do not need to worry about shifting.
1145 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1146 lambda[efptCOUL], lambda[efptVDW],
1147 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1150 #endif /* GMX_MPI */
1154 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), flags, pmeFlags, wcycle);
1157 /* do gridding for pair search */
1160 if (graph && bStateChanged)
1162 /* Calculate intramolecular shift vectors to make molecules whole */
1163 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1167 box_diag[XX] = box[XX][XX];
1168 box_diag[YY] = box[YY][YY];
1169 box_diag[ZZ] = box[ZZ][ZZ];
1171 wallcycle_start(wcycle, ewcNS);
1172 if (!DOMAINDECOMP(cr))
1174 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1175 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1177 nullptr, 0, mdatoms->homenr, -1,
1178 fr->cginfo, x.unpaddedArrayRef(),
1180 nbv->grp[eintLocal].kernel_type,
1182 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1186 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1187 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1188 fr->cginfo, x.unpaddedArrayRef(),
1189 nbv->grp[eintNonlocal].kernel_type,
1191 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1194 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1196 wallcycle_stop(wcycle, ewcNS);
1199 /* initialize the GPU atom data and copy shift vector */
1202 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1203 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1207 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1210 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1212 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1214 if (bNS && fr->gpuBonded)
1216 /* Now we put all atoms on the grid, we can assign bonded
1217 * interactions to the GPU, where the grid order is
1218 * needed. Also the xq, f and fshift device buffers have
1219 * been reallocated if needed, so the bonded code can
1220 * learn about them. */
1221 // TODO the xq, f, and fshift buffers are now shared
1222 // resources, so they should be maintained by a
1223 // higher-level object than the nb module.
1224 fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
1226 nbnxn_gpu_get_xq(nbv->gpu_nbv),
1227 nbnxn_gpu_get_f(nbv->gpu_nbv),
1228 nbnxn_gpu_get_fshift(nbv->gpu_nbv));
1229 ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
1232 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1235 /* do local pair search */
1238 wallcycle_start_nocount(wcycle, ewcNS);
1239 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1240 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1242 nbv->listParams->rlistOuter,
1243 nbv->min_ci_balanced,
1244 &nbv->grp[eintLocal].nbl_lists,
1246 nbv->grp[eintLocal].kernel_type,
1248 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1249 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1251 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1253 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1257 /* initialize local pair-list on the GPU */
1258 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1259 nbv->grp[eintLocal].nbl_lists.nblGpu[0],
1262 wallcycle_stop(wcycle, ewcNS);
1266 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1272 if (DOMAINDECOMP(cr))
1274 ddOpenBalanceRegionGpu(cr->dd);
1277 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1279 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1280 nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
1281 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1283 // bonded work not split into separate local and non-local, so with DD
1284 // we can only launch the kernel after non-local coordinates have been received.
1285 if (ppForceWorkload->haveGpuBondedWork && !DOMAINDECOMP(cr))
1287 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1288 fr->gpuBonded->launchKernels(fr, flags, box);
1289 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1292 /* launch local nonbonded work on GPU */
1293 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1294 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1295 step, nrnb, wcycle);
1296 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1297 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1302 // In PME GPU and mixed mode we launch FFT / gather after the
1303 // X copy/transform to allow overlap as well as after the GPU NB
1304 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1305 // the nonbonded kernel.
1306 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1309 /* Communicate coordinates and sum dipole if necessary +
1310 do non-local pair search */
1311 if (DOMAINDECOMP(cr))
1315 wallcycle_start_nocount(wcycle, ewcNS);
1316 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1318 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1320 nbv->listParams->rlistOuter,
1321 nbv->min_ci_balanced,
1322 &nbv->grp[eintNonlocal].nbl_lists,
1324 nbv->grp[eintNonlocal].kernel_type,
1326 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1327 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1329 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1331 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1333 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1335 /* initialize non-local pair-list on the GPU */
1336 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1337 nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
1340 wallcycle_stop(wcycle, ewcNS);
1344 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1346 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
1352 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1354 /* launch non-local nonbonded tasks on GPU */
1355 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1356 nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1357 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1359 if (ppForceWorkload->haveGpuBondedWork)
1361 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
1362 fr->gpuBonded->launchKernels(fr, flags, box);
1363 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1366 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1367 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1368 step, nrnb, wcycle);
1369 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1371 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1377 /* launch D2H copy-back F */
1378 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1379 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1380 if (DOMAINDECOMP(cr))
1382 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1383 flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
1385 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1386 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
1387 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1389 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1391 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1392 fr->gpuBonded->launchEnergyTransfer();
1393 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1395 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1398 if (bStateChanged && inputrecNeedMutot(inputrec))
1402 gmx_sumd(2*DIM, mu, cr);
1403 ddReopenBalanceRegionCpu(cr->dd);
1406 for (i = 0; i < 2; i++)
1408 for (j = 0; j < DIM; j++)
1410 fr->mu_tot[i][j] = mu[i*DIM + j];
1414 if (fr->efep == efepNO)
1416 copy_rvec(fr->mu_tot[0], mu_tot);
1420 for (j = 0; j < DIM; j++)
1423 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1424 lambda[efptCOUL]*fr->mu_tot[1][j];
1428 /* Reset energies */
1429 reset_enerdata(enerd);
1430 clear_rvecs(SHIFTS, fr->fshift);
1432 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1434 wallcycle_start(wcycle, ewcPPDURINGPME);
1435 dd_force_flop_start(cr->dd, nrnb);
1440 wallcycle_start(wcycle, ewcROT);
1441 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1442 wallcycle_stop(wcycle, ewcROT);
1445 /* Temporary solution until all routines take PaddedRVecVector */
1446 rvec *const f = as_rvec_array(force.unpaddedArrayRef().data());
1448 /* Start the force cycle counter.
1449 * Note that a different counter is used for dynamic load balancing.
1451 wallcycle_start(wcycle, ewcFORCE);
1453 gmx::ArrayRef<gmx::RVec> forceRef = force.unpaddedArrayRef();
1456 /* If we need to compute the virial, we might need a separate
1457 * force buffer for algorithms for which the virial is calculated
1458 * directly, such as PME.
1460 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1462 forceRef = *fr->forceBufferForDirectVirialContributions;
1464 /* TODO: update comment
1465 * We only compute forces on local atoms. Note that vsites can
1466 * spread to non-local atoms, but that part of the buffer is
1467 * cleared separately in the vsite spreading code.
1469 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1471 /* Clear the short- and long-range forces */
1472 clear_rvecs_omp(fr->natoms_force_constr, f);
1475 /* forceWithVirial uses the local atom range only */
1476 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
1478 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1480 clear_pull_forces(inputrec->pull_work);
1483 /* We calculate the non-bonded forces, when done on the CPU, here.
1484 * We do this before calling do_force_lowlevel, because in that
1485 * function, the listed forces are calculated before PME, which
1486 * does communication. With this order, non-bonded and listed
1487 * force calculation imbalance can be balanced out by the domain
1488 * decomposition load balancing.
1493 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1494 step, nrnb, wcycle);
1497 if (fr->efep != efepNO)
1499 /* Calculate the local and non-local free energy interactions here.
1500 * Happens here on the CPU both with and without GPU.
1502 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1504 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1505 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1506 inputrec->fepvals, lambda,
1507 enerd, flags, nrnb, wcycle);
1510 if (DOMAINDECOMP(cr) &&
1511 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1513 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1514 fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
1515 inputrec->fepvals, lambda,
1516 enerd, flags, nrnb, wcycle);
1524 if (DOMAINDECOMP(cr))
1526 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1527 step, nrnb, wcycle);
1536 aloc = eintNonlocal;
1539 /* Add all the non-bonded force to the normal force array.
1540 * This can be split into a local and a non-local part when overlapping
1541 * communication with calculation with domain decomposition.
1543 wallcycle_stop(wcycle, ewcFORCE);
1545 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1547 wallcycle_start_nocount(wcycle, ewcFORCE);
1549 /* if there are multiple fshift output buffers reduce them */
1550 if ((flags & GMX_FORCE_VIRIAL) &&
1551 nbv->grp[aloc].nbl_lists.nnbl > 1)
1553 /* This is not in a subcounter because it takes a
1554 negligible and constant-sized amount of time */
1555 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1560 /* update QMMMrec, if necessary */
1563 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
1566 /* Compute the bonded and non-bonded energies and optionally forces */
1567 do_force_lowlevel(fr, inputrec, &(top->idef),
1568 cr, ms, nrnb, wcycle, mdatoms,
1569 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
1570 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1571 flags, &cycles_pme);
1573 wallcycle_stop(wcycle, ewcFORCE);
1575 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
1577 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
1578 flags, &forceWithVirial, enerd,
1583 /* wait for non-local forces (or calculate in emulation mode) */
1584 if (DOMAINDECOMP(cr))
1588 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1589 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1591 ppForceWorkload->haveGpuBondedWork,
1592 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1594 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1598 wallcycle_start_nocount(wcycle, ewcFORCE);
1599 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1600 step, nrnb, wcycle);
1601 wallcycle_stop(wcycle, ewcFORCE);
1604 /* skip the reduction if there was no non-local work to do */
1605 if (nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->nsci > 0)
1607 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1608 nbv->nbat, f, wcycle);
1613 if (DOMAINDECOMP(cr))
1615 /* We are done with the CPU compute.
1616 * We will now communicate the non-local forces.
1617 * If we use a GPU this will overlap with GPU work, so in that case
1618 * we do not close the DD force balancing region here.
1620 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1622 ddCloseBalanceRegionCpu(cr->dd);
1626 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
1630 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1631 // an alternating wait/reduction scheme.
1632 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1633 if (alternateGpuWait)
1635 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
1638 if (!alternateGpuWait && useGpuPme)
1640 pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceWithVirial, enerd);
1643 /* Wait for local GPU NB outputs on the non-alternating wait path */
1644 if (!alternateGpuWait && bUseGPU)
1646 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1647 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1648 * but even with a step of 0.1 ms the difference is less than 1%
1651 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1653 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1654 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1655 flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
1656 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1658 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1660 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1662 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1663 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1665 /* We measured few cycles, it could be that the kernel
1666 * and transfer finished earlier and there was no actual
1667 * wait time, only API call overhead.
1668 * Then the actual time could be anywhere between 0 and
1669 * cycles_wait_est. We will use half of cycles_wait_est.
1671 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1673 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1677 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1679 // NOTE: emulation kernel is not included in the balancing region,
1680 // but emulation mode does not target performance anyway
1681 wallcycle_start_nocount(wcycle, ewcFORCE);
1682 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1683 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1684 step, nrnb, wcycle);
1685 wallcycle_stop(wcycle, ewcFORCE);
1690 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1695 /* now clear the GPU outputs while we finish the step on the CPU */
1696 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1697 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1698 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1700 /* Is dynamic pair-list pruning activated? */
1701 if (nbv->listParams->useDynamicPruning)
1703 launchGpuRollingPruning(cr, nbv, inputrec, step);
1705 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1706 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1709 if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
1711 wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
1712 // in principle this should be included in the DD balancing region,
1713 // but generally it is infrequent so we'll omit it for the sake of
1715 fr->gpuBonded->accumulateEnergyTerms(enerd);
1716 wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
1718 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1719 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
1720 fr->gpuBonded->clearEnergies();
1721 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
1722 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1725 /* Do the nonbonded GPU (or emulation) force buffer reduction
1726 * on the non-alternating path. */
1727 if (bUseOrEmulGPU && !alternateGpuWait)
1729 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1730 nbv->nbat, f, wcycle);
1732 if (DOMAINDECOMP(cr))
1734 dd_force_flop_stop(cr->dd, nrnb);
1739 /* If we have NoVirSum forces, but we do not calculate the virial,
1740 * we sum fr->f_novirsum=f later.
1742 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1744 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
1745 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1748 if (flags & GMX_FORCE_VIRIAL)
1750 /* Calculation of the virial must be done after vsites! */
1751 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
1752 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1756 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1758 /* In case of node-splitting, the PP nodes receive the long-range
1759 * forces, virial and energy from the PME nodes here.
1761 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1766 post_process_forces(cr, step, nrnb, wcycle,
1767 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
1768 vir_force, mdatoms, graph, fr, vsite,
1772 if (flags & GMX_FORCE_ENERGY)
1774 /* Sum the potential energy terms from group contributions */
1775 sum_epot(&(enerd->grpp), enerd->term);
1777 if (!EI_TPI(inputrec->eI))
1779 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1784 static void do_force_cutsGROUP(FILE *fplog,
1785 const t_commrec *cr,
1786 const gmx_multisim_t *ms,
1787 const t_inputrec *inputrec,
1789 gmx_enfrot *enforcedRotation,
1792 gmx_wallcycle_t wcycle,
1793 gmx_localtop_t *top,
1794 const gmx_groups_t *groups,
1795 matrix box, gmx::ArrayRefWithPadding<gmx::RVec> x,
1797 gmx::ArrayRefWithPadding<gmx::RVec> force,
1799 const t_mdatoms *mdatoms,
1800 gmx_enerdata_t *enerd,
1805 const gmx_vsite_t *vsite,
1810 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1811 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1815 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1819 const int start = 0;
1820 const int homenr = mdatoms->homenr;
1822 clear_mat(vir_force);
1825 if (DOMAINDECOMP(cr))
1827 cg1 = cr->dd->globalAtomGroupIndices.size();
1838 bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
1839 bNS = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
1840 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1841 bFillGrid = (bNS && bStateChanged);
1842 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1843 bDoForces = ((flags & GMX_FORCE_FORCES) != 0);
1847 update_forcerec(fr, box);
1849 if (inputrecNeedMutot(inputrec))
1851 /* Calculate total (local) dipole moment in a temporary common array.
1852 * This makes it possible to sum them over nodes faster.
1854 calc_mu(start, homenr,
1855 x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1860 if (fr->ePBC != epbcNONE)
1862 /* Compute shift vectors every step,
1863 * because of pressure coupling or box deformation!
1865 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1867 calc_shifts(box, fr->shift_vec);
1872 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1873 &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1874 inc_nrnb(nrnb, eNR_CGCM, homenr);
1875 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1877 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1879 unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
1884 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.unpaddedArrayRef().data()), fr->cg_cm);
1885 inc_nrnb(nrnb, eNR_CGCM, homenr);
1888 if (bCalcCGCM && gmx_debug_at)
1890 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1894 if (!thisRankHasDuty(cr, DUTY_PME))
1896 /* Send particle coordinates to the pme nodes.
1897 * Since this is only implemented for domain decomposition
1898 * and domain decomposition does not use the graph,
1899 * we do not need to worry about shifting.
1901 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
1902 lambda[efptCOUL], lambda[efptVDW],
1903 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
1906 #endif /* GMX_MPI */
1908 /* Communicate coordinates and sum dipole if necessary */
1909 if (DOMAINDECOMP(cr))
1911 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1913 /* No GPU support, no move_x overlap, so reopen the balance region here */
1914 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1916 ddReopenBalanceRegionCpu(cr->dd);
1920 if (inputrecNeedMutot(inputrec))
1926 gmx_sumd(2*DIM, mu, cr);
1927 ddReopenBalanceRegionCpu(cr->dd);
1929 for (i = 0; i < 2; i++)
1931 for (j = 0; j < DIM; j++)
1933 fr->mu_tot[i][j] = mu[i*DIM + j];
1937 if (fr->efep == efepNO)
1939 copy_rvec(fr->mu_tot[0], mu_tot);
1943 for (j = 0; j < DIM; j++)
1946 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1951 /* Reset energies */
1952 reset_enerdata(enerd);
1953 clear_rvecs(SHIFTS, fr->fshift);
1957 wallcycle_start(wcycle, ewcNS);
1959 if (graph && bStateChanged)
1961 /* Calculate intramolecular shift vectors to make molecules whole */
1962 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
1965 /* Do the actual neighbour searching */
1967 groups, top, mdatoms,
1968 cr, nrnb, bFillGrid);
1970 wallcycle_stop(wcycle, ewcNS);
1973 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1975 wallcycle_start(wcycle, ewcPPDURINGPME);
1976 dd_force_flop_start(cr->dd, nrnb);
1981 wallcycle_start(wcycle, ewcROT);
1982 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, bNS);
1983 wallcycle_stop(wcycle, ewcROT);
1986 /* Temporary solution until all routines take PaddedRVecVector */
1987 rvec *f = as_rvec_array(force.unpaddedArrayRef().data());
1989 /* Start the force cycle counter.
1990 * Note that a different counter is used for dynamic load balancing.
1992 wallcycle_start(wcycle, ewcFORCE);
1994 gmx::ArrayRef<gmx::RVec> forceRef = force.paddedArrayRef();
1997 /* If we need to compute the virial, we might need a separate
1998 * force buffer for algorithms for which the virial is calculated
1999 * separately, such as PME.
2001 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
2003 forceRef = *fr->forceBufferForDirectVirialContributions;
2005 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
2008 /* Clear the short- and long-range forces */
2009 clear_rvecs(fr->natoms_force_constr, f);
2012 /* forceWithVirial might need the full force atom range */
2013 gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);
2015 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
2017 clear_pull_forces(inputrec->pull_work);
2020 /* update QMMMrec, if necessary */
2023 update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
2026 /* Compute the bonded and non-bonded energies and optionally forces */
2027 do_force_lowlevel(fr, inputrec, &(top->idef),
2028 cr, ms, nrnb, wcycle, mdatoms,
2029 as_rvec_array(x.unpaddedArrayRef().data()), hist, f, &forceWithVirial, enerd, fcd,
2030 box, inputrec->fepvals, lambda,
2031 graph, &(top->excls), fr->mu_tot,
2035 wallcycle_stop(wcycle, ewcFORCE);
2037 if (DOMAINDECOMP(cr))
2039 dd_force_flop_stop(cr->dd, nrnb);
2041 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2043 ddCloseBalanceRegionCpu(cr->dd);
2047 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2049 fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
2050 flags, &forceWithVirial, enerd,
2055 /* Communicate the forces */
2056 if (DOMAINDECOMP(cr))
2058 dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
2059 /* Do we need to communicate the separate force array
2060 * for terms that do not contribute to the single sum virial?
2061 * Position restraints and electric fields do not introduce
2062 * inter-cg forces, only full electrostatics methods do.
2063 * When we do not calculate the virial, fr->f_novirsum = f,
2064 * so we have already communicated these forces.
2066 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2067 (flags & GMX_FORCE_VIRIAL))
2069 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2073 /* If we have NoVirSum forces, but we do not calculate the virial,
2074 * we sum fr->f_novirsum=f later.
2076 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2078 spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fr->fshift, FALSE, nullptr, nrnb,
2079 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2082 if (flags & GMX_FORCE_VIRIAL)
2084 /* Calculation of the virial must be done after vsites! */
2085 calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), f,
2086 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2090 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2092 /* In case of node-splitting, the PP nodes receive the long-range
2093 * forces, virial and energy from the PME nodes here.
2095 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2100 post_process_forces(cr, step, nrnb, wcycle,
2101 top, box, as_rvec_array(x.unpaddedArrayRef().data()), f, &forceWithVirial,
2102 vir_force, mdatoms, graph, fr, vsite,
2106 if (flags & GMX_FORCE_ENERGY)
2108 /* Sum the potential energy terms from group contributions */
2109 sum_epot(&(enerd->grpp), enerd->term);
2111 if (!EI_TPI(inputrec->eI))
2113 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2119 void do_force(FILE *fplog,
2120 const t_commrec *cr,
2121 const gmx_multisim_t *ms,
2122 const t_inputrec *inputrec,
2124 gmx_enfrot *enforcedRotation,
2127 gmx_wallcycle_t wcycle,
2128 gmx_localtop_t *top,
2129 const gmx_groups_t *groups,
2131 gmx::ArrayRefWithPadding<gmx::RVec> x, //NOLINT(performance-unnecessary-value-param)
2133 gmx::ArrayRefWithPadding<gmx::RVec> force, //NOLINT(performance-unnecessary-value-param)
2135 const t_mdatoms *mdatoms,
2136 gmx_enerdata_t *enerd,
2138 gmx::ArrayRef<real> lambda,
2141 gmx::PpForceWorkload *ppForceWorkload,
2142 const gmx_vsite_t *vsite,
2147 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2148 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2150 /* modify force flag if not doing nonbonded */
2151 if (!fr->bNonbonded)
2153 flags &= ~GMX_FORCE_NONBONDED;
2156 switch (inputrec->cutoff_scheme)
2159 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2160 awh, enforcedRotation, step, nrnb, wcycle,
2167 lambda.data(), graph,
2174 ddOpenBalanceRegion,
2175 ddCloseBalanceRegion);
2178 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2179 awh, enforcedRotation, step, nrnb, wcycle,
2186 lambda.data(), graph,
2190 ddOpenBalanceRegion,
2191 ddCloseBalanceRegion);
2194 gmx_incons("Invalid cut-off scheme passed!");
2197 /* In case we don't have constraints and are using GPUs, the next balancing
2198 * region starts here.
2199 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2200 * virial calculation and COM pulling, is not thus not included in
2201 * the balance timing, which is ok as most tasks do communication.
2203 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2205 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2210 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2211 const t_inputrec *ir, const t_mdatoms *md,
2214 int i, m, start, end;
2216 real dt = ir->delta_t;
2220 /* We need to allocate one element extra, since we might use
2221 * (unaligned) 4-wide SIMD loads to access rvec entries.
2223 snew(savex, state->natoms + 1);
2230 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2231 start, md->homenr, end);
2233 /* Do a first constrain to reset particles... */
2234 step = ir->init_step;
2237 char buf[STEPSTRSIZE];
2238 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2239 gmx_step_str(step, buf));
2243 /* constrain the current position */
2244 constr->apply(TRUE, FALSE,
2246 state->x.rvec_array(), state->x.rvec_array(), nullptr,
2248 state->lambda[efptBONDED], &dvdl_dum,
2249 nullptr, nullptr, gmx::ConstraintVariable::Positions);
2252 /* constrain the inital velocity, and save it */
2253 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2254 constr->apply(TRUE, FALSE,
2256 state->x.rvec_array(), state->v.rvec_array(), state->v.rvec_array(),
2258 state->lambda[efptBONDED], &dvdl_dum,
2259 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
2261 /* constrain the inital velocities at t-dt/2 */
2262 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2264 auto x = makeArrayRef(state->x).subArray(start, end);
2265 auto v = makeArrayRef(state->v).subArray(start, end);
2266 for (i = start; (i < end); i++)
2268 for (m = 0; (m < DIM); m++)
2270 /* Reverse the velocity */
2272 /* Store the position at t-dt in buf */
2273 savex[i][m] = x[i][m] + dt*v[i][m];
2276 /* Shake the positions at t=-dt with the positions at t=0
2277 * as reference coordinates.
2281 char buf[STEPSTRSIZE];
2282 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2283 gmx_step_str(step, buf));
2286 constr->apply(TRUE, FALSE,
2288 state->x.rvec_array(), savex, nullptr,
2290 state->lambda[efptBONDED], &dvdl_dum,
2291 state->v.rvec_array(), nullptr, gmx::ConstraintVariable::Positions);
2293 for (i = start; i < end; i++)
2295 for (m = 0; m < DIM; m++)
2297 /* Re-reverse the velocities */
2307 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2308 double *enerout, double *virout)
2310 double enersum, virsum;
2311 double invscale, invscale2, invscale3;
2312 double r, ea, eb, ec, pa, pb, pc, pd;
2317 invscale = 1.0/scale;
2318 invscale2 = invscale*invscale;
2319 invscale3 = invscale*invscale2;
2321 /* Following summation derived from cubic spline definition,
2322 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2323 * the cubic spline. We first calculate the negative of the
2324 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2325 * add the more standard, abrupt cutoff correction to that result,
2326 * yielding the long-range correction for a switched function. We
2327 * perform both the pressure and energy loops at the same time for
2328 * simplicity, as the computational cost is low. */
2332 /* Since the dispersion table has been scaled down a factor
2333 * 6.0 and the repulsion a factor 12.0 to compensate for the
2334 * c6/c12 parameters inside nbfp[] being scaled up (to save
2335 * flops in kernels), we need to correct for this.
2346 for (ri = rstart; ri < rend; ++ri)
2350 eb = 2.0*invscale2*r;
2354 pb = 3.0*invscale2*r;
2355 pc = 3.0*invscale*r*r;
2358 /* this "8" is from the packing in the vdwtab array - perhaps
2359 should be defined? */
2361 offset = 8*ri + offstart;
2362 y0 = vdwtab[offset];
2363 f = vdwtab[offset+1];
2364 g = vdwtab[offset+2];
2365 h = vdwtab[offset+3];
2367 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2368 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2370 *enerout = 4.0*M_PI*enersum*tabfactor;
2371 *virout = 4.0*M_PI*virsum*tabfactor;
2374 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2376 double eners[2], virs[2], enersum, virsum;
2377 double r0, rc3, rc9;
2379 real scale, *vdwtab;
2381 fr->enershiftsix = 0;
2382 fr->enershifttwelve = 0;
2383 fr->enerdiffsix = 0;
2384 fr->enerdifftwelve = 0;
2386 fr->virdifftwelve = 0;
2388 const interaction_const_t *ic = fr->ic;
2390 if (eDispCorr != edispcNO)
2392 for (i = 0; i < 2; i++)
2397 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2398 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2399 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2400 (ic->vdwtype == evdwSHIFT) ||
2401 (ic->vdwtype == evdwSWITCH))
2403 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2404 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2405 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2408 "With dispersion correction rvdw-switch can not be zero "
2409 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2412 /* TODO This code depends on the logic in tables.c that
2413 constructs the table layout, which should be made
2414 explicit in future cleanup. */
2415 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2416 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2417 scale = fr->dispersionCorrectionTable->scale;
2418 vdwtab = fr->dispersionCorrectionTable->data;
2420 /* Round the cut-offs to exact table values for precision */
2421 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2422 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2424 /* The code below has some support for handling force-switching, i.e.
2425 * when the force (instead of potential) is switched over a limited
2426 * region. This leads to a constant shift in the potential inside the
2427 * switching region, which we can handle by adding a constant energy
2428 * term in the force-switch case just like when we do potential-shift.
2430 * For now this is not enabled, but to keep the functionality in the
2431 * code we check separately for switch and shift. When we do force-switch
2432 * the shifting point is rvdw_switch, while it is the cutoff when we
2433 * have a classical potential-shift.
2435 * For a pure potential-shift the potential has a constant shift
2436 * all the way out to the cutoff, and that is it. For other forms
2437 * we need to calculate the constant shift up to the point where we
2438 * start modifying the potential.
2440 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2446 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2447 (ic->vdwtype == evdwSHIFT))
2449 /* Determine the constant energy shift below rvdw_switch.
2450 * Table has a scale factor since we have scaled it down to compensate
2451 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2453 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2454 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2456 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2458 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2459 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2462 /* Add the constant part from 0 to rvdw_switch.
2463 * This integration from 0 to rvdw_switch overcounts the number
2464 * of interactions by 1, as it also counts the self interaction.
2465 * We will correct for this later.
2467 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2468 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2470 /* Calculate the contribution in the range [r0,r1] where we
2471 * modify the potential. For a pure potential-shift modifier we will
2472 * have ri0==ri1, and there will not be any contribution here.
2474 for (i = 0; i < 2; i++)
2478 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2479 eners[i] -= enersum;
2483 /* Alright: Above we compensated by REMOVING the parts outside r0
2484 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2486 * Regardless of whether r0 is the point where we start switching,
2487 * or the cutoff where we calculated the constant shift, we include
2488 * all the parts we are missing out to infinity from r0 by
2489 * calculating the analytical dispersion correction.
2491 eners[0] += -4.0*M_PI/(3.0*rc3);
2492 eners[1] += 4.0*M_PI/(9.0*rc9);
2493 virs[0] += 8.0*M_PI/rc3;
2494 virs[1] += -16.0*M_PI/(3.0*rc9);
2496 else if (ic->vdwtype == evdwCUT ||
2497 EVDW_PME(ic->vdwtype) ||
2498 ic->vdwtype == evdwUSER)
2500 if (ic->vdwtype == evdwUSER && fplog)
2503 "WARNING: using dispersion correction with user tables\n");
2506 /* Note that with LJ-PME, the dispersion correction is multiplied
2507 * by the difference between the actual C6 and the value of C6
2508 * that would produce the combination rule.
2509 * This means the normal energy and virial difference formulas
2513 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2515 /* Contribution beyond the cut-off */
2516 eners[0] += -4.0*M_PI/(3.0*rc3);
2517 eners[1] += 4.0*M_PI/(9.0*rc9);
2518 if (ic->vdw_modifier == eintmodPOTSHIFT)
2520 /* Contribution within the cut-off */
2521 eners[0] += -4.0*M_PI/(3.0*rc3);
2522 eners[1] += 4.0*M_PI/(3.0*rc9);
2524 /* Contribution beyond the cut-off */
2525 virs[0] += 8.0*M_PI/rc3;
2526 virs[1] += -16.0*M_PI/(3.0*rc9);
2531 "Dispersion correction is not implemented for vdw-type = %s",
2532 evdw_names[ic->vdwtype]);
2535 /* When we deprecate the group kernels the code below can go too */
2536 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2538 /* Calculate self-interaction coefficient (assuming that
2539 * the reciprocal-space contribution is constant in the
2540 * region that contributes to the self-interaction).
2542 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2544 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2545 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2548 fr->enerdiffsix = eners[0];
2549 fr->enerdifftwelve = eners[1];
2550 /* The 0.5 is due to the Gromacs definition of the virial */
2551 fr->virdiffsix = 0.5*virs[0];
2552 fr->virdifftwelve = 0.5*virs[1];
2556 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2557 const matrix box, real lambda, tensor pres, tensor virial,
2558 real *prescorr, real *enercorr, real *dvdlcorr)
2560 gmx_bool bCorrAll, bCorrPres;
2561 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2571 if (ir->eDispCorr != edispcNO)
2573 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2574 ir->eDispCorr == edispcAllEnerPres);
2575 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2576 ir->eDispCorr == edispcAllEnerPres);
2578 invvol = 1/det(box);
2581 /* Only correct for the interactions with the inserted molecule */
2582 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2587 dens = fr->numAtomsForDispersionCorrection*invvol;
2588 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2591 if (ir->efep == efepNO)
2593 avcsix = fr->avcsix[0];
2594 avctwelve = fr->avctwelve[0];
2598 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2599 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2602 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2603 *enercorr += avcsix*enerdiff;
2605 if (ir->efep != efepNO)
2607 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2611 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2612 *enercorr += avctwelve*enerdiff;
2613 if (fr->efep != efepNO)
2615 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2621 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2622 if (ir->eDispCorr == edispcAllEnerPres)
2624 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2626 /* The factor 2 is because of the Gromacs virial definition */
2627 spres = -2.0*invvol*svir*PRESFAC;
2629 for (m = 0; m < DIM; m++)
2631 virial[m][m] += svir;
2632 pres[m][m] += spres;
2637 /* Can't currently control when it prints, for now, just print when degugging */
2642 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2648 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2649 *enercorr, spres, svir);
2653 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2657 if (fr->efep != efepNO)
2659 *dvdlcorr += dvdlambda;
2664 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2665 const gmx_mtop_t *mtop, rvec x[],
2671 if (bFirst && fplog)
2673 fprintf(fplog, "Removing pbc first time\n");
2678 for (const gmx_molblock_t &molb : mtop->molblock)
2680 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2681 if (moltype.atoms.nr == 1 ||
2682 (!bFirst && moltype.cgs.nr == 1))
2684 /* Just one atom or charge group in the molecule, no PBC required */
2685 as += molb.nmol*moltype.atoms.nr;
2689 mk_graph_moltype(moltype, graph);
2691 for (mol = 0; mol < molb.nmol; mol++)
2693 mk_mshift(fplog, graph, ePBC, box, x+as);
2695 shift_self(graph, box, x+as);
2696 /* The molecule is whole now.
2697 * We don't need the second mk_mshift call as in do_pbc_first,
2698 * since we no longer need this graph.
2701 as += moltype.atoms.nr;
2709 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2710 const gmx_mtop_t *mtop, rvec x[])
2712 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2715 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2716 const gmx_mtop_t *mtop, rvec x[])
2718 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2721 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2724 nth = gmx_omp_nthreads_get(emntDefault);
2726 #pragma omp parallel for num_threads(nth) schedule(static)
2727 for (t = 0; t < nth; t++)
2731 size_t natoms = x.size();
2732 size_t offset = (natoms*t )/nth;
2733 size_t len = (natoms*(t + 1))/nth - offset;
2734 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2736 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2740 // TODO This can be cleaned up a lot, and move back to runner.cpp
2741 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2742 const t_inputrec *inputrec,
2743 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2744 gmx_walltime_accounting_t walltime_accounting,
2745 nonbonded_verlet_t *nbv,
2746 const gmx_pme_t *pme,
2747 gmx_bool bWriteStat)
2749 t_nrnb *nrnb_tot = nullptr;
2751 double nbfs = 0, mflop = 0;
2752 double elapsed_time,
2753 elapsed_time_over_all_ranks,
2754 elapsed_time_over_all_threads,
2755 elapsed_time_over_all_threads_over_all_ranks;
2756 /* Control whether it is valid to print a report. Only the
2757 simulation master may print, but it should not do so if the run
2758 terminated e.g. before a scheduled reset step. This is
2759 complicated by the fact that PME ranks are unaware of the
2760 reason why they were sent a pmerecvqxFINISH. To avoid
2761 communication deadlocks, we always do the communication for the
2762 report, even if we've decided not to write the report, because
2763 how long it takes to finish the run is not important when we've
2764 decided not to report on the simulation performance.
2766 Further, we only report performance for dynamical integrators,
2767 because those are the only ones for which we plan to
2768 consider doing any optimizations. */
2769 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2771 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2773 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2774 printReport = false;
2781 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2782 cr->mpi_comm_mysim);
2790 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2791 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2795 /* reduce elapsed_time over all MPI ranks in the current simulation */
2796 MPI_Allreduce(&elapsed_time,
2797 &elapsed_time_over_all_ranks,
2798 1, MPI_DOUBLE, MPI_SUM,
2799 cr->mpi_comm_mysim);
2800 elapsed_time_over_all_ranks /= cr->nnodes;
2801 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2802 * current simulation. */
2803 MPI_Allreduce(&elapsed_time_over_all_threads,
2804 &elapsed_time_over_all_threads_over_all_ranks,
2805 1, MPI_DOUBLE, MPI_SUM,
2806 cr->mpi_comm_mysim);
2811 elapsed_time_over_all_ranks = elapsed_time;
2812 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2817 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2824 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2826 print_dd_statistics(cr, inputrec, fplog);
2829 /* TODO Move the responsibility for any scaling by thread counts
2830 * to the code that handled the thread region, so that there's a
2831 * mechanism to keep cycle counting working during the transition
2832 * to task parallelism. */
2833 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2834 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2835 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2836 auto cycle_sum(wallcycle_sum(cr, wcycle));
2840 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2841 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2842 if (pme_gpu_task_enabled(pme))
2844 pme_gpu_get_timings(pme, &pme_gpu_timings);
2846 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2847 elapsed_time_over_all_ranks,
2852 if (EI_DYNAMICS(inputrec->eI))
2854 delta_t = inputrec->delta_t;
2859 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2860 elapsed_time_over_all_ranks,
2861 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2862 delta_t, nbfs, mflop);
2866 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2867 elapsed_time_over_all_ranks,
2868 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2869 delta_t, nbfs, mflop);
2874 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2876 /* this function works, but could probably use a logic rewrite to keep all the different
2877 types of efep straight. */
2879 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2884 t_lambda *fep = ir->fepvals;
2885 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2886 if checkpoint is set -- a kludge is in for now
2889 for (int i = 0; i < efptNR; i++)
2891 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2892 if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
2894 lambda[i] = fep->init_lambda;
2897 lam0[i] = lambda[i];
2902 lambda[i] = fep->all_lambda[i][*fep_state];
2905 lam0[i] = lambda[i];
2911 /* need to rescale control temperatures to match current state */
2912 for (int i = 0; i < ir->opts.ngtc; i++)
2914 if (ir->opts.ref_t[i] > 0)
2916 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2921 /* Send to the log the information on the current lambdas */
2922 if (fplog != nullptr)
2924 fprintf(fplog, "Initial vector of lambda components:[ ");
2925 for (const auto &l : lambda)
2927 fprintf(fplog, "%10.4f ", l);
2929 fprintf(fplog, "]\n");
2934 void init_md(FILE *fplog,
2935 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2936 t_inputrec *ir, const gmx_output_env_t *oenv,
2937 const MdrunOptions &mdrunOptions,
2938 double *t, double *t0,
2939 t_state *globalState, double *lam0,
2940 t_nrnb *nrnb, gmx_mtop_t *mtop,
2942 gmx::BoxDeformation *deform,
2943 int nfile, const t_filenm fnm[],
2944 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2945 tensor force_vir, tensor shake_vir,
2946 tensor total_vir, tensor pres, rvec mu_tot,
2948 gmx_wallcycle_t wcycle)
2952 /* Initial values */
2953 *t = *t0 = ir->init_t;
2956 for (i = 0; i < ir->opts.ngtc; i++)
2958 /* set bSimAnn if any group is being annealed */
2959 if (ir->opts.annealing[i] != eannNO)
2965 /* Initialize lambda variables */
2966 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2967 * We currently need to call initialize_lambdas on non-master ranks
2968 * to initialize lam0.
2972 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2977 std::array<real, efptNR> tmpLambda;
2978 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2981 // TODO upd is never NULL in practice, but the analysers don't know that
2984 *upd = init_update(ir, deform);
2988 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2991 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2993 if (ir->etc == etcBERENDSEN)
2995 please_cite(fplog, "Berendsen84a");
2997 if (ir->etc == etcVRESCALE)
2999 please_cite(fplog, "Bussi2007a");
3001 if (ir->eI == eiSD1)
3003 please_cite(fplog, "Goga2012");
3010 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3012 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3013 mtop, ir, mdoutf_get_fp_dhdl(*outf));
3016 /* Initiate variables */
3017 clear_mat(force_vir);
3018 clear_mat(shake_vir);
3020 clear_mat(total_vir);
3024 void init_rerun(FILE *fplog,
3025 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
3026 t_inputrec *ir, const gmx_output_env_t *oenv,
3027 const MdrunOptions &mdrunOptions,
3028 t_state *globalState, double *lam0,
3029 t_nrnb *nrnb, gmx_mtop_t *mtop,
3030 int nfile, const t_filenm fnm[],
3031 gmx_mdoutf_t *outf, t_mdebin **mdebin,
3032 gmx_wallcycle_t wcycle)
3034 /* Initialize lambda variables */
3035 /* TODO: Clean up initialization of fep_state and lambda in t_state.
3036 * We currently need to call initialize_lambdas on non-master ranks
3037 * to initialize lam0.
3041 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
3046 std::array<real, efptNR> tmpLambda;
3047 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
3054 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
3055 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
3056 mtop, ir, mdoutf_get_fp_dhdl(*outf), true);