2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/awh/awh.h"
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/imd/imd.h"
65 #include "gromacs/listed-forces/bonded.h"
66 #include "gromacs/listed-forces/disre.h"
67 #include "gromacs/listed-forces/orires.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/units.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/math/vecdump.h"
72 #include "gromacs/mdlib/calcmu.h"
73 #include "gromacs/mdlib/calcvir.h"
74 #include "gromacs/mdlib/constr.h"
75 #include "gromacs/mdlib/force.h"
76 #include "gromacs/mdlib/forcerec.h"
77 #include "gromacs/mdlib/gmx_omp_nthreads.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_atomdata.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_grid.h"
83 #include "gromacs/mdlib/nbnxn_search.h"
84 #include "gromacs/mdlib/qmmm.h"
85 #include "gromacs/mdlib/update.h"
86 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
87 #include "gromacs/mdtypes/commrec.h"
88 #include "gromacs/mdtypes/forceoutput.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/topology/topology.h"
104 #include "gromacs/utility/arrayref.h"
105 #include "gromacs/utility/basedefinitions.h"
106 #include "gromacs/utility/cstringutil.h"
107 #include "gromacs/utility/exceptions.h"
108 #include "gromacs/utility/fatalerror.h"
109 #include "gromacs/utility/gmxassert.h"
110 #include "gromacs/utility/gmxmpi.h"
111 #include "gromacs/utility/logger.h"
112 #include "gromacs/utility/pleasecite.h"
113 #include "gromacs/utility/smalloc.h"
114 #include "gromacs/utility/sysinfo.h"
116 #include "nbnxn_gpu.h"
117 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
118 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
120 // TODO: this environment variable allows us to verify before release
121 // that on less common architectures the total cost of polling is not larger than
122 // a blocking wait (so polling does not introduce overhead when the static
123 // PME-first ordering would suffice).
124 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
127 void print_time(FILE *out,
128 gmx_walltime_accounting_t walltime_accounting,
134 char timebuf[STRLEN];
135 double dt, elapsed_seconds, time_per_step;
144 fprintf(out, "step %s", gmx_step_str(step, buf));
147 if ((step >= ir->nstlist))
149 double seconds_since_epoch = gmx_gettime();
150 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
151 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
152 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
158 finish = (time_t) (seconds_since_epoch + dt);
159 gmx_ctime_r(&finish, timebuf, STRLEN);
160 sprintf(buf, "%s", timebuf);
161 buf[strlen(buf)-1] = '\0';
162 fprintf(out, ", will finish %s", buf);
166 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
171 fprintf(out, " performance: %.1f ns/day ",
172 ir->delta_t/1000*24*60*60/time_per_step);
181 GMX_UNUSED_VALUE(cr);
187 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
190 char time_string[STRLEN];
199 char timebuf[STRLEN];
200 time_t temp_time = (time_t) the_time;
202 gmx_ctime_r(&temp_time, timebuf, STRLEN);
203 for (i = 0; timebuf[i] >= ' '; i++)
205 time_string[i] = timebuf[i];
207 time_string[i] = '\0';
210 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
213 void print_start(FILE *fplog, const t_commrec *cr,
214 gmx_walltime_accounting_t walltime_accounting,
219 sprintf(buf, "Started %s", name);
220 print_date_and_time(fplog, cr->nodeid, buf,
221 walltime_accounting_get_start_time_stamp(walltime_accounting));
224 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
226 const int end = forceToAdd.size();
228 // cppcheck-suppress unreadVariable
229 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
230 #pragma omp parallel for num_threads(nt) schedule(static)
231 for (int i = 0; i < end; i++)
233 rvec_inc(f[i], forceToAdd[i]);
237 static void pme_gpu_reduce_outputs(gmx_wallcycle_t wcycle,
238 ForceWithVirial *forceWithVirial,
239 ArrayRef<const gmx::RVec> pmeForces,
240 gmx_enerdata_t *enerd,
244 wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
245 GMX_ASSERT(forceWithVirial, "Invalid force pointer");
246 forceWithVirial->addVirialContribution(vir_Q);
247 enerd->term[F_COUL_RECIP] += Vlr_q;
248 sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
249 wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
252 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
253 tensor vir_part, t_graph *graph, matrix box,
254 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
258 /* The short-range virial from surrounding boxes */
259 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
260 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
262 /* Calculate partial virial, for local atoms only, based on short range.
263 * Total virial is computed in global_stat, called from do_md
265 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
266 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
268 /* Add wall contribution */
269 for (i = 0; i < DIM; i++)
271 vir_part[i][ZZ] += fr->vir_wall_z[i];
276 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
280 static void pull_potential_wrapper(const t_commrec *cr,
281 const t_inputrec *ir,
282 matrix box, ArrayRef<const RVec> x,
283 ForceWithVirial *force,
284 const t_mdatoms *mdatoms,
285 gmx_enerdata_t *enerd,
288 gmx_wallcycle_t wcycle)
293 /* Calculate the center of mass forces, this requires communication,
294 * which is why pull_potential is called close to other communication.
296 wallcycle_start(wcycle, ewcPULLPOT);
297 set_pbc(&pbc, ir->ePBC, box);
299 enerd->term[F_COM_PULL] +=
300 pull_potential(ir->pull_work, mdatoms, &pbc,
301 cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
302 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
303 wallcycle_stop(wcycle, ewcPULLPOT);
306 static void pme_receive_force_ener(const t_commrec *cr,
307 ForceWithVirial *forceWithVirial,
308 gmx_enerdata_t *enerd,
309 gmx_wallcycle_t wcycle)
311 real e_q, e_lj, dvdl_q, dvdl_lj;
312 float cycles_ppdpme, cycles_seppme;
314 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
315 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
317 /* In case of node-splitting, the PP nodes receive the long-range
318 * forces, virial and energy from the PME nodes here.
320 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
323 gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
325 enerd->term[F_COUL_RECIP] += e_q;
326 enerd->term[F_LJ_RECIP] += e_lj;
327 enerd->dvdl_lin[efptCOUL] += dvdl_q;
328 enerd->dvdl_lin[efptVDW] += dvdl_lj;
332 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
334 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
337 static void print_large_forces(FILE *fp,
345 real force2Tolerance = gmx::square(forceTolerance);
346 std::uintmax_t numNonFinite = 0;
347 for (int i = 0; i < md->homenr; i++)
349 real force2 = norm2(f[i]);
350 bool nonFinite = !std::isfinite(force2);
351 if (force2 >= force2Tolerance || nonFinite)
353 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
355 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
362 if (numNonFinite > 0)
364 /* Note that with MPI this fatal call on one rank might interrupt
365 * the printing on other ranks. But we can only avoid that with
366 * an expensive MPI barrier that we would need at each step.
368 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
372 static void post_process_forces(const t_commrec *cr,
375 gmx_wallcycle_t wcycle,
376 const gmx_localtop_t *top,
380 ForceWithVirial *forceWithVirial,
382 const t_mdatoms *mdatoms,
385 const gmx_vsite_t *vsite,
388 if (fr->haveDirectVirialContributions)
390 rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
394 /* Spread the mesh force on virtual sites to the other particles...
395 * This is parallellized. MPI communication is performed
396 * if the constructing atoms aren't local.
398 matrix virial = { { 0 } };
399 spread_vsite_f(vsite, x, fDirectVir, nullptr,
400 (flags & GMX_FORCE_VIRIAL), virial,
402 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
403 forceWithVirial->addVirialContribution(virial);
406 if (flags & GMX_FORCE_VIRIAL)
408 /* Now add the forces, this is local */
409 sum_forces(f, forceWithVirial->force_);
411 /* Add the direct virial contributions */
412 GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
413 m_add(vir_force, forceWithVirial->getVirial(), vir_force);
417 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
422 if (fr->print_force >= 0)
424 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
428 static void do_nb_verlet(t_forcerec *fr,
429 interaction_const_t *ic,
430 gmx_enerdata_t *enerd,
431 int flags, int ilocality,
435 gmx_wallcycle_t wcycle)
437 if (!(flags & GMX_FORCE_NONBONDED))
439 /* skip non-bonded calculation */
443 nonbonded_verlet_t *nbv = fr->nbv;
444 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
446 /* GPU kernel launch overhead is already timed separately */
447 if (fr->cutoff_scheme != ecutsVERLET)
449 gmx_incons("Invalid cut-off scheme passed!");
452 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
454 if (!bUsingGpuKernels)
456 /* When dynamic pair-list pruning is requested, we need to prune
457 * at nstlistPrune steps.
459 if (nbv->listParams->useDynamicPruning &&
460 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
462 /* Prune the pair-list beyond fr->ic->rlistPrune using
463 * the current coordinates of the atoms.
465 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
466 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
467 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
470 wallcycle_sub_start(wcycle, ewcsNONBONDED);
473 switch (nbvg->kernel_type)
475 case nbnxnk4x4_PlainC:
476 case nbnxnk4xN_SIMD_4xN:
477 case nbnxnk4xN_SIMD_2xNN:
478 nbnxn_kernel_cpu(nbvg,
485 enerd->grpp.ener[egCOULSR],
487 enerd->grpp.ener[egBHAMSR] :
488 enerd->grpp.ener[egLJSR]);
491 case nbnxnk8x8x8_GPU:
492 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
495 case nbnxnk8x8x8_PlainC:
496 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
503 enerd->grpp.ener[egCOULSR],
505 enerd->grpp.ener[egBHAMSR] :
506 enerd->grpp.ener[egLJSR]);
510 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
513 if (!bUsingGpuKernels)
515 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
518 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
519 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
521 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
523 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
524 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
526 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
530 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
532 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
533 if (flags & GMX_FORCE_ENERGY)
535 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
536 enr_nbnxn_kernel_ljc += 1;
537 enr_nbnxn_kernel_lj += 1;
540 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
541 nbvg->nbl_lists.natpair_ljq);
542 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
543 nbvg->nbl_lists.natpair_lj);
544 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
545 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
546 nbvg->nbl_lists.natpair_q);
548 if (ic->vdw_modifier == eintmodFORCESWITCH)
550 /* We add up the switch cost separately */
551 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
552 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
554 if (ic->vdw_modifier == eintmodPOTSWITCH)
556 /* We add up the switch cost separately */
557 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
558 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
560 if (ic->vdwtype == evdwPME)
562 /* We add up the LJ Ewald cost separately */
563 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
564 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
568 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
572 const t_mdatoms *mdatoms,
575 gmx_enerdata_t *enerd,
578 gmx_wallcycle_t wcycle)
581 nb_kernel_data_t kernel_data;
583 real dvdl_nb[efptNR];
588 /* Add short-range interactions */
589 donb_flags |= GMX_NONBONDED_DO_SR;
591 /* Currently all group scheme kernels always calculate (shift-)forces */
592 if (flags & GMX_FORCE_FORCES)
594 donb_flags |= GMX_NONBONDED_DO_FORCE;
596 if (flags & GMX_FORCE_VIRIAL)
598 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
600 if (flags & GMX_FORCE_ENERGY)
602 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
605 kernel_data.flags = donb_flags;
606 kernel_data.lambda = lambda;
607 kernel_data.dvdl = dvdl_nb;
609 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
610 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
612 /* reset free energy components */
613 for (i = 0; i < efptNR; i++)
618 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");
620 wallcycle_sub_start(wcycle, ewcsNONBONDED);
621 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
622 for (th = 0; th < nbl_lists->nnbl; th++)
626 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
627 x, f, fr, mdatoms, &kernel_data, nrnb);
629 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
632 if (fepvals->sc_alpha != 0)
634 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
635 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
639 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
640 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
643 /* If we do foreign lambda and we have soft-core interactions
644 * we have to recalculate the (non-linear) energies contributions.
646 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
648 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
649 kernel_data.lambda = lam_i;
650 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
651 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
652 /* Note that we add to kernel_data.dvdl, but ignore the result */
654 for (i = 0; i < enerd->n_lambda; i++)
656 for (j = 0; j < efptNR; j++)
658 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
660 reset_foreign_enerdata(enerd);
661 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
662 for (th = 0; th < nbl_lists->nnbl; th++)
666 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
667 x, f, fr, mdatoms, &kernel_data, nrnb);
669 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
672 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
673 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
677 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
680 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
682 return nbv != nullptr && nbv->bUseGPU;
685 static inline void clear_rvecs_omp(int n, rvec v[])
687 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
689 /* Note that we would like to avoid this conditional by putting it
690 * into the omp pragma instead, but then we still take the full
691 * omp parallel for overhead (at least with gcc5).
695 for (int i = 0; i < n; i++)
702 #pragma omp parallel for num_threads(nth) schedule(static)
703 for (int i = 0; i < n; i++)
710 /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
712 * \param groupOptions Group options, containing T-coupling options
714 static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
716 real nrdfCoupled = 0;
717 real nrdfUncoupled = 0;
718 real kineticEnergy = 0;
719 for (int g = 0; g < groupOptions.ngtc; g++)
721 if (groupOptions.tau_t[g] >= 0)
723 nrdfCoupled += groupOptions.nrdf[g];
724 kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
728 nrdfUncoupled += groupOptions.nrdf[g];
732 /* This conditional with > also catches nrdf=0 */
733 if (nrdfCoupled > nrdfUncoupled)
735 return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
743 /*! \brief This routine checks that the potential energy is finite.
745 * Always checks that the potential energy is finite. If step equals
746 * inputrec.init_step also checks that the magnitude of the potential energy
747 * is reasonable. Terminates with a fatal error when a check fails.
748 * Note that passing this check does not guarantee finite forces,
749 * since those use slightly different arithmetics. But in most cases
750 * there is just a narrow coordinate range where forces are not finite
751 * and energies are finite.
753 * \param[in] step The step number, used for checking and printing
754 * \param[in] enerd The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
755 * \param[in] inputrec The input record
757 static void checkPotentialEnergyValidity(gmx_int64_t step,
758 const gmx_enerdata_t &enerd,
759 const t_inputrec &inputrec)
761 /* Threshold valid for comparing absolute potential energy against
762 * the kinetic energy. Normally one should not consider absolute
763 * potential energy values, but with a factor of one million
764 * we should never get false positives.
766 constexpr real c_thresholdFactor = 1e6;
768 bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
769 real averageKineticEnergy = 0;
770 /* We only check for large potential energy at the initial step,
771 * because that is by far the most likely step for this too occur
772 * and because computing the average kinetic energy is not free.
773 * Note: nstcalcenergy >> 1 often does not allow to catch large energies
774 * before they become NaN.
776 if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
778 averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
781 if (energyIsNotFinite || (averageKineticEnergy > 0 &&
782 enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
784 gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
787 energyIsNotFinite ? "not finite" : "extremely high",
789 enerd.term[F_COUL_SR],
790 energyIsNotFinite ? "non-finite" : "very high",
791 energyIsNotFinite ? " or Nan" : "");
795 /*! \brief Compute forces and/or energies for special algorithms
797 * The intention is to collect all calls to algorithms that compute
798 * forces on local atoms only and that do not contribute to the local
799 * virial sum (but add their virial contribution separately).
800 * Eventually these should likely all become ForceProviders.
801 * Within this function the intention is to have algorithms that do
802 * global communication at the end, so global barriers within the MD loop
803 * are as close together as possible.
805 * \param[in] fplog The log file
806 * \param[in] cr The communication record
807 * \param[in] inputrec The input record
808 * \param[in] awh The Awh module (nullptr if none in use).
809 * \param[in] step The current MD step
810 * \param[in] t The current time
811 * \param[in,out] wcycle Wallcycle accounting struct
812 * \param[in,out] forceProviders Pointer to a list of force providers
813 * \param[in] box The unit cell
814 * \param[in] x The coordinates
815 * \param[in] mdatoms Per atom properties
816 * \param[in] lambda Array of free-energy lambda values
817 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
818 * \param[in,out] forceWithVirial Force and virial buffers
819 * \param[in,out] enerd Energy buffer
820 * \param[in,out] ed Essential dynamics pointer
821 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
823 * \todo Remove bNS, which is used incorrectly.
824 * \todo Convert all other algorithms called here to ForceProviders.
827 computeSpecialForces(FILE *fplog,
829 const t_inputrec *inputrec,
833 gmx_wallcycle_t wcycle,
834 ForceProviders *forceProviders,
836 ArrayRef<const RVec> x,
837 const t_mdatoms *mdatoms,
840 ForceWithVirial *forceWithVirial,
841 gmx_enerdata_t *enerd,
845 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
847 /* NOTE: Currently all ForceProviders only provide forces.
848 * When they also provide energies, remove this conditional.
852 ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
853 ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
855 /* Collect forces from modules */
856 forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
859 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
861 pull_potential_wrapper(cr, inputrec, box, x,
863 mdatoms, enerd, lambda, t,
868 enerd->term[F_COM_PULL] +=
869 awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
871 t, step, wcycle, fplog);
875 rvec *f = as_rvec_array(forceWithVirial->force_.data());
877 /* Add the forces from enforced rotation potentials (if any) */
880 wallcycle_start(wcycle, ewcROTadd);
881 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
882 wallcycle_stop(wcycle, ewcROTadd);
887 /* Note that since init_edsam() is called after the initialization
888 * of forcerec, edsam doesn't request the noVirSum force buffer.
889 * Thus if no other algorithm (e.g. PME) requires it, the forces
890 * here will contribute to the virial.
892 do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
895 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
896 if (inputrec->bIMD && computeForces)
898 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
902 /*! \brief Launch the prepare_step and spread stages of PME GPU.
904 * \param[in] pmedata The PME structure
905 * \param[in] box The box matrix
906 * \param[in] x Coordinate array
907 * \param[in] flags Force flags
908 * \param[in] wcycle The wallcycle structure
910 static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
914 gmx_wallcycle_t wcycle)
916 int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
917 pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
918 pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
920 pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
921 pme_gpu_launch_spread(pmedata, x, wcycle);
924 /*! \brief Launch the FFT and gather stages of PME GPU
926 * This function only implements setting the output forces (no accumulation).
928 * \param[in] pmedata The PME structure
929 * \param[in] wcycle The wallcycle structure
931 static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
932 gmx_wallcycle_t wcycle)
934 pme_gpu_launch_complex_transforms(pmedata, wcycle);
935 pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
939 * Polling wait for either of the PME or nonbonded GPU tasks.
941 * Instead of a static order in waiting for GPU tasks, this function
942 * polls checking which of the two tasks completes first, and does the
943 * associated force buffer reduction overlapped with the other task.
944 * By doing that, unlike static scheduling order, it can always overlap
945 * one of the reductions, regardless of the GPU task completion order.
947 * \param[in] nbv Nonbonded verlet structure
948 * \param[in] pmedata PME module data
949 * \param[in,out] force Force array to reduce task outputs into.
950 * \param[in,out] forceWithVirial Force and virial buffers
951 * \param[in,out] fshift Shift force output vector results are reduced into
952 * \param[in,out] enerd Energy data structure results are reduced into
953 * \param[in] flags Force flags
954 * \param[in] wcycle The wallcycle structure
956 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
957 const gmx_pme_t *pmedata,
958 gmx::PaddedArrayRef<gmx::RVec> *force,
959 ForceWithVirial *forceWithVirial,
961 gmx_enerdata_t *enerd,
963 gmx_wallcycle_t wcycle)
965 bool isPmeGpuDone = false;
966 bool isNbGpuDone = false;
969 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
971 while (!isPmeGpuDone || !isNbGpuDone)
978 GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
979 isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
980 vir_Q, &Vlr_q, completionType);
984 pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
985 enerd, vir_Q, Vlr_q);
991 GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
992 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
993 isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
995 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
996 fshift, completionType);
997 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
998 // To get the call count right, when the task finished we
999 // issue a start/stop.
1000 // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
1001 // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
1004 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1005 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1007 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1008 nbv->nbat, as_rvec_array(force->data()), wcycle);
1015 * Launch the dynamic rolling pruning GPU task.
1017 * We currently alternate local/non-local list pruning in odd-even steps
1018 * (only pruning every second step without DD).
1020 * \param[in] cr The communication record
1021 * \param[in] nbv Nonbonded verlet structure
1022 * \param[in] inputrec The input record
1023 * \param[in] step The current MD step
1025 static inline void launchGpuRollingPruning(const t_commrec *cr,
1026 const nonbonded_verlet_t *nbv,
1027 const t_inputrec *inputrec,
1028 const gmx_int64_t step)
1030 /* We should not launch the rolling pruning kernel at a search
1031 * step or just before search steps, since that's useless.
1032 * Without domain decomposition we prune at even steps.
1033 * With domain decomposition we alternate local and non-local
1034 * pruning at even and odd steps.
1036 int numRollingParts = nbv->listParams->numRollingParts;
1037 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1038 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1039 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1040 if (stepWithCurrentList > 0 &&
1041 stepWithCurrentList < inputrec->nstlist - 1 &&
1042 (stepIsEven || DOMAINDECOMP(cr)))
1044 nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
1045 stepIsEven ? eintLocal : eintNonlocal,
1050 static void do_force_cutsVERLET(FILE *fplog,
1051 const t_commrec *cr,
1052 const gmx_multisim_t *ms,
1053 const t_inputrec *inputrec,
1057 gmx_wallcycle_t wcycle,
1058 const gmx_localtop_t *top,
1059 const gmx_groups_t * /* groups */,
1060 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1062 gmx::PaddedArrayRef<gmx::RVec> force,
1064 const t_mdatoms *mdatoms,
1065 gmx_enerdata_t *enerd, t_fcdata *fcd,
1069 interaction_const_t *ic,
1070 const gmx_vsite_t *vsite,
1073 const gmx_edsam *ed,
1075 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1076 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1080 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1081 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
1082 rvec vzero, box_diag;
1083 float cycles_pme, cycles_wait_gpu;
1084 nonbonded_verlet_t *nbv = fr->nbv;
1086 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1087 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1088 bFillGrid = (bNS && bStateChanged);
1089 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1090 bDoForces = (flags & GMX_FORCE_FORCES);
1091 bUseGPU = fr->nbv->bUseGPU;
1092 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
1094 const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
1095 // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
1096 const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
1097 ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
1099 /* At a search step we need to start the first balancing region
1100 * somewhere early inside the step after communication during domain
1101 * decomposition (and not during the previous step as usual).
1104 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1106 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
1109 cycles_wait_gpu = 0;
1111 const int start = 0;
1112 const int homenr = mdatoms->homenr;
1114 clear_mat(vir_force);
1116 if (DOMAINDECOMP(cr))
1118 cg1 = cr->dd->globalAtomGroupIndices.size();
1131 update_forcerec(fr, box);
1133 if (inputrecNeedMutot(inputrec))
1135 /* Calculate total (local) dipole moment in a temporary common array.
1136 * This makes it possible to sum them over nodes faster.
1138 calc_mu(start, homenr,
1139 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1144 if (fr->ePBC != epbcNONE)
1146 /* Compute shift vectors every step,
1147 * because of pressure coupling or box deformation!
1149 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1151 calc_shifts(box, fr->shift_vec);
1156 put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
1157 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
1159 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1161 unshift_self(graph, box, as_rvec_array(x.data()));
1165 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
1166 fr->shift_vec, nbv->nbat);
1169 if (!thisRankHasDuty(cr, DUTY_PME))
1171 /* Send particle coordinates to the pme nodes.
1172 * Since this is only implemented for domain decomposition
1173 * and domain decomposition does not use the graph,
1174 * we do not need to worry about shifting.
1176 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1177 lambda[efptCOUL], lambda[efptVDW],
1178 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1181 #endif /* GMX_MPI */
1185 launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
1188 /* do gridding for pair search */
1191 if (graph && bStateChanged)
1193 /* Calculate intramolecular shift vectors to make molecules whole */
1194 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1198 box_diag[XX] = box[XX][XX];
1199 box_diag[YY] = box[YY][YY];
1200 box_diag[ZZ] = box[ZZ][ZZ];
1202 wallcycle_start(wcycle, ewcNS);
1203 if (!DOMAINDECOMP(cr))
1205 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
1206 nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
1208 0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
1210 nbv->grp[eintLocal].kernel_type,
1212 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
1216 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
1217 nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
1218 fr->cginfo, as_rvec_array(x.data()),
1219 nbv->grp[eintNonlocal].kernel_type,
1221 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
1224 nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
1225 wallcycle_stop(wcycle, ewcNS);
1228 /* initialize the GPU atom data and copy shift vector */
1231 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1232 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1236 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
1239 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
1241 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1242 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1245 /* do local pair search */
1248 wallcycle_start_nocount(wcycle, ewcNS);
1249 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
1250 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1252 nbv->listParams->rlistOuter,
1253 nbv->min_ci_balanced,
1254 &nbv->grp[eintLocal].nbl_lists,
1256 nbv->grp[eintLocal].kernel_type,
1258 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
1259 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1261 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1263 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1267 /* initialize local pair-list on the GPU */
1268 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1269 nbv->grp[eintLocal].nbl_lists.nbl[0],
1272 wallcycle_stop(wcycle, ewcNS);
1276 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
1282 if (DOMAINDECOMP(cr))
1284 ddOpenBalanceRegionGpu(cr->dd);
1287 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1288 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1289 /* launch local nonbonded work on GPU */
1290 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1291 step, nrnb, wcycle);
1292 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1293 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1298 // In PME GPU and mixed mode we launch FFT / gather after the
1299 // X copy/transform to allow overlap as well as after the GPU NB
1300 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1301 // the nonbonded kernel.
1302 launchPmeGpuFftAndGather(fr->pmedata, wcycle);
1305 /* Communicate coordinates and sum dipole if necessary +
1306 do non-local pair search */
1307 if (DOMAINDECOMP(cr))
1311 wallcycle_start_nocount(wcycle, ewcNS);
1312 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1314 nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
1316 nbv->listParams->rlistOuter,
1317 nbv->min_ci_balanced,
1318 &nbv->grp[eintNonlocal].nbl_lists,
1320 nbv->grp[eintNonlocal].kernel_type,
1322 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1323 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1325 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1327 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1329 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1331 /* initialize non-local pair-list on the GPU */
1332 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1333 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1336 wallcycle_stop(wcycle, ewcNS);
1340 dd_move_x(cr->dd, box, x, wcycle);
1342 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
1348 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1349 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1350 /* launch non-local nonbonded tasks on GPU */
1351 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1352 step, nrnb, wcycle);
1353 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1354 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1360 /* launch D2H copy-back F */
1361 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1362 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1363 if (DOMAINDECOMP(cr))
1365 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1366 flags, eatNonlocal);
1368 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1370 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1371 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1374 if (bStateChanged && inputrecNeedMutot(inputrec))
1378 gmx_sumd(2*DIM, mu, cr);
1379 ddReopenBalanceRegionCpu(cr->dd);
1382 for (i = 0; i < 2; i++)
1384 for (j = 0; j < DIM; j++)
1386 fr->mu_tot[i][j] = mu[i*DIM + j];
1390 if (fr->efep == efepNO)
1392 copy_rvec(fr->mu_tot[0], mu_tot);
1396 for (j = 0; j < DIM; j++)
1399 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1400 lambda[efptCOUL]*fr->mu_tot[1][j];
1404 /* Reset energies */
1405 reset_enerdata(enerd);
1406 clear_rvecs(SHIFTS, fr->fshift);
1408 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1410 wallcycle_start(wcycle, ewcPPDURINGPME);
1411 dd_force_flop_start(cr->dd, nrnb);
1416 wallcycle_start(wcycle, ewcROT);
1417 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1418 wallcycle_stop(wcycle, ewcROT);
1421 /* Temporary solution until all routines take PaddedRVecVector */
1422 rvec *f = as_rvec_array(force.data());
1424 /* Start the force cycle counter.
1425 * Note that a different counter is used for dynamic load balancing.
1427 wallcycle_start(wcycle, ewcFORCE);
1429 gmx::ArrayRef<gmx::RVec> forceRef = force;
1432 /* If we need to compute the virial, we might need a separate
1433 * force buffer for algorithms for which the virial is calculated
1434 * directly, such as PME.
1436 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1438 forceRef = *fr->forceBufferForDirectVirialContributions;
1440 /* We only compute forces on local atoms. Note that vsites can
1441 * spread to non-local atoms, but that part of the buffer is
1442 * cleared separately in the vsite spreading code.
1444 clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
1446 /* Clear the short- and long-range forces */
1447 clear_rvecs_omp(fr->natoms_force_constr, f);
1450 /* forceWithVirial uses the local atom range only */
1451 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1453 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1455 clear_pull_forces(inputrec->pull_work);
1458 /* We calculate the non-bonded forces, when done on the CPU, here.
1459 * We do this before calling do_force_lowlevel, because in that
1460 * function, the listed forces are calculated before PME, which
1461 * does communication. With this order, non-bonded and listed
1462 * force calculation imbalance can be balanced out by the domain
1463 * decomposition load balancing.
1468 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1469 step, nrnb, wcycle);
1472 if (fr->efep != efepNO)
1474 /* Calculate the local and non-local free energy interactions here.
1475 * Happens here on the CPU both with and without GPU.
1477 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1479 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1480 fr, as_rvec_array(x.data()), f, mdatoms,
1481 inputrec->fepvals, lambda,
1482 enerd, flags, nrnb, wcycle);
1485 if (DOMAINDECOMP(cr) &&
1486 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1488 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1489 fr, as_rvec_array(x.data()), f, mdatoms,
1490 inputrec->fepvals, lambda,
1491 enerd, flags, nrnb, wcycle);
1499 if (DOMAINDECOMP(cr))
1501 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1502 step, nrnb, wcycle);
1511 aloc = eintNonlocal;
1514 /* Add all the non-bonded force to the normal force array.
1515 * This can be split into a local and a non-local part when overlapping
1516 * communication with calculation with domain decomposition.
1518 wallcycle_stop(wcycle, ewcFORCE);
1520 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
1522 wallcycle_start_nocount(wcycle, ewcFORCE);
1524 /* if there are multiple fshift output buffers reduce them */
1525 if ((flags & GMX_FORCE_VIRIAL) &&
1526 nbv->grp[aloc].nbl_lists.nnbl > 1)
1528 /* This is not in a subcounter because it takes a
1529 negligible and constant-sized amount of time */
1530 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1535 /* update QMMMrec, if necessary */
1538 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1541 /* Compute the bonded and non-bonded energies and optionally forces */
1542 do_force_lowlevel(fr, inputrec, &(top->idef),
1543 cr, ms, nrnb, wcycle, mdatoms,
1544 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1545 box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1546 flags, &cycles_pme);
1548 wallcycle_stop(wcycle, ewcFORCE);
1550 computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
1551 fr->forceProviders, box, x, mdatoms, lambda,
1552 flags, &forceWithVirial, enerd,
1557 /* wait for non-local forces (or calculate in emulation mode) */
1558 if (DOMAINDECOMP(cr))
1562 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1563 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1565 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1567 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1571 wallcycle_start_nocount(wcycle, ewcFORCE);
1572 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1573 step, nrnb, wcycle);
1574 wallcycle_stop(wcycle, ewcFORCE);
1577 /* skip the reduction if there was no non-local work to do */
1578 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1580 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
1581 nbv->nbat, f, wcycle);
1586 if (DOMAINDECOMP(cr))
1588 /* We are done with the CPU compute.
1589 * We will now communicate the non-local forces.
1590 * If we use a GPU this will overlap with GPU work, so in that case
1591 * we do not close the DD force balancing region here.
1593 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1595 ddCloseBalanceRegionCpu(cr->dd);
1599 dd_move_f(cr->dd, force, fr->fshift, wcycle);
1603 // With both nonbonded and PME offloaded a GPU on the same rank, we use
1604 // an alternating wait/reduction scheme.
1605 bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
1606 if (alternateGpuWait)
1608 alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
1611 if (!alternateGpuWait && useGpuPme)
1613 gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
1616 pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
1617 pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
1620 /* Wait for local GPU NB outputs on the non-alternating wait path */
1621 if (!alternateGpuWait && bUseGPU)
1623 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1624 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1625 * but even with a step of 0.1 ms the difference is less than 1%
1628 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1630 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1631 nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
1633 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1635 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1637 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1639 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1640 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1642 /* We measured few cycles, it could be that the kernel
1643 * and transfer finished earlier and there was no actual
1644 * wait time, only API call overhead.
1645 * Then the actual time could be anywhere between 0 and
1646 * cycles_wait_est. We will use half of cycles_wait_est.
1648 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1650 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1654 if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
1656 // NOTE: emulation kernel is not included in the balancing region,
1657 // but emulation mode does not target performance anyway
1658 wallcycle_start_nocount(wcycle, ewcFORCE);
1659 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1660 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1661 step, nrnb, wcycle);
1662 wallcycle_stop(wcycle, ewcFORCE);
1667 pme_gpu_reinit_computation(fr->pmedata, wcycle);
1672 /* now clear the GPU outputs while we finish the step on the CPU */
1673 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1674 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1675 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1677 /* Is dynamic pair-list pruning activated? */
1678 if (nbv->listParams->useDynamicPruning)
1680 launchGpuRollingPruning(cr, nbv, inputrec, step);
1682 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1683 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1686 /* Do the nonbonded GPU (or emulation) force buffer reduction
1687 * on the non-alternating path. */
1688 if (bUseOrEmulGPU && !alternateGpuWait)
1690 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
1691 nbv->nbat, f, wcycle);
1694 if (DOMAINDECOMP(cr))
1696 dd_force_flop_stop(cr->dd, nrnb);
1701 /* If we have NoVirSum forces, but we do not calculate the virial,
1702 * we sum fr->f_novirsum=f later.
1704 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
1706 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
1707 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
1710 if (flags & GMX_FORCE_VIRIAL)
1712 /* Calculation of the virial must be done after vsites! */
1713 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
1714 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1718 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
1720 /* In case of node-splitting, the PP nodes receive the long-range
1721 * forces, virial and energy from the PME nodes here.
1723 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
1728 post_process_forces(cr, step, nrnb, wcycle,
1729 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
1730 vir_force, mdatoms, graph, fr, vsite,
1734 if (flags & GMX_FORCE_ENERGY)
1736 /* Sum the potential energy terms from group contributions */
1737 sum_epot(&(enerd->grpp), enerd->term);
1739 if (!EI_TPI(inputrec->eI))
1741 checkPotentialEnergyValidity(step, *enerd, *inputrec);
1746 static void do_force_cutsGROUP(FILE *fplog,
1747 const t_commrec *cr,
1748 const gmx_multisim_t *ms,
1749 const t_inputrec *inputrec,
1753 gmx_wallcycle_t wcycle,
1754 gmx_localtop_t *top,
1755 const gmx_groups_t *groups,
1756 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1758 gmx::PaddedArrayRef<gmx::RVec> force,
1760 const t_mdatoms *mdatoms,
1761 gmx_enerdata_t *enerd,
1766 const gmx_vsite_t *vsite,
1769 const gmx_edsam *ed,
1771 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1772 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1776 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1780 const int start = 0;
1781 const int homenr = mdatoms->homenr;
1783 clear_mat(vir_force);
1786 if (DOMAINDECOMP(cr))
1788 cg1 = cr->dd->globalAtomGroupIndices.size();
1799 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1800 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1801 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1802 bFillGrid = (bNS && bStateChanged);
1803 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1804 bDoForces = (flags & GMX_FORCE_FORCES);
1808 update_forcerec(fr, box);
1810 if (inputrecNeedMutot(inputrec))
1812 /* Calculate total (local) dipole moment in a temporary common array.
1813 * This makes it possible to sum them over nodes faster.
1815 calc_mu(start, homenr,
1816 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1821 if (fr->ePBC != epbcNONE)
1823 /* Compute shift vectors every step,
1824 * because of pressure coupling or box deformation!
1826 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1828 calc_shifts(box, fr->shift_vec);
1833 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1834 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1835 inc_nrnb(nrnb, eNR_CGCM, homenr);
1836 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1838 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1840 unshift_self(graph, box, as_rvec_array(x.data()));
1845 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1846 inc_nrnb(nrnb, eNR_CGCM, homenr);
1849 if (bCalcCGCM && gmx_debug_at)
1851 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1855 if (!thisRankHasDuty(cr, DUTY_PME))
1857 /* Send particle coordinates to the pme nodes.
1858 * Since this is only implemented for domain decomposition
1859 * and domain decomposition does not use the graph,
1860 * we do not need to worry about shifting.
1862 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1863 lambda[efptCOUL], lambda[efptVDW],
1864 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1867 #endif /* GMX_MPI */
1869 /* Communicate coordinates and sum dipole if necessary */
1870 if (DOMAINDECOMP(cr))
1872 dd_move_x(cr->dd, box, x, wcycle);
1874 /* No GPU support, no move_x overlap, so reopen the balance region here */
1875 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1877 ddReopenBalanceRegionCpu(cr->dd);
1881 if (inputrecNeedMutot(inputrec))
1887 gmx_sumd(2*DIM, mu, cr);
1888 ddReopenBalanceRegionCpu(cr->dd);
1890 for (i = 0; i < 2; i++)
1892 for (j = 0; j < DIM; j++)
1894 fr->mu_tot[i][j] = mu[i*DIM + j];
1898 if (fr->efep == efepNO)
1900 copy_rvec(fr->mu_tot[0], mu_tot);
1904 for (j = 0; j < DIM; j++)
1907 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1912 /* Reset energies */
1913 reset_enerdata(enerd);
1914 clear_rvecs(SHIFTS, fr->fshift);
1918 wallcycle_start(wcycle, ewcNS);
1920 if (graph && bStateChanged)
1922 /* Calculate intramolecular shift vectors to make molecules whole */
1923 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1926 /* Do the actual neighbour searching */
1928 groups, top, mdatoms,
1929 cr, nrnb, bFillGrid);
1931 wallcycle_stop(wcycle, ewcNS);
1934 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1936 wallcycle_start(wcycle, ewcPPDURINGPME);
1937 dd_force_flop_start(cr->dd, nrnb);
1942 wallcycle_start(wcycle, ewcROT);
1943 do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
1944 wallcycle_stop(wcycle, ewcROT);
1947 /* Temporary solution until all routines take PaddedRVecVector */
1948 rvec *f = as_rvec_array(force.data());
1950 /* Start the force cycle counter.
1951 * Note that a different counter is used for dynamic load balancing.
1953 wallcycle_start(wcycle, ewcFORCE);
1955 gmx::ArrayRef<gmx::RVec> forceRef = force;
1958 /* If we need to compute the virial, we might need a separate
1959 * force buffer for algorithms for which the virial is calculated
1960 * separately, such as PME.
1962 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1964 forceRef = *fr->forceBufferForDirectVirialContributions;
1966 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1969 /* Clear the short- and long-range forces */
1970 clear_rvecs(fr->natoms_force_constr, f);
1973 /* forceWithVirial might need the full force atom range */
1974 ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1976 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1978 clear_pull_forces(inputrec->pull_work);
1981 /* update QMMMrec, if necessary */
1984 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1987 /* Compute the bonded and non-bonded energies and optionally forces */
1988 do_force_lowlevel(fr, inputrec, &(top->idef),
1989 cr, ms, nrnb, wcycle, mdatoms,
1990 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1991 box, inputrec->fepvals, lambda,
1992 graph, &(top->excls), fr->mu_tot,
1996 wallcycle_stop(wcycle, ewcFORCE);
1998 if (DOMAINDECOMP(cr))
2000 dd_force_flop_stop(cr->dd, nrnb);
2002 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2004 ddCloseBalanceRegionCpu(cr->dd);
2008 computeSpecialForces(fplog, cr, inputrec, awh, step, t, wcycle,
2009 fr->forceProviders, box, x, mdatoms, lambda,
2010 flags, &forceWithVirial, enerd,
2015 /* Communicate the forces */
2016 if (DOMAINDECOMP(cr))
2018 dd_move_f(cr->dd, force, fr->fshift, wcycle);
2019 /* Do we need to communicate the separate force array
2020 * for terms that do not contribute to the single sum virial?
2021 * Position restraints and electric fields do not introduce
2022 * inter-cg forces, only full electrostatics methods do.
2023 * When we do not calculate the virial, fr->f_novirsum = f,
2024 * so we have already communicated these forces.
2026 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2027 (flags & GMX_FORCE_VIRIAL))
2029 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2033 /* If we have NoVirSum forces, but we do not calculate the virial,
2034 * we sum fr->f_novirsum=f later.
2036 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2038 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2039 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2042 if (flags & GMX_FORCE_VIRIAL)
2044 /* Calculation of the virial must be done after vsites! */
2045 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2046 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2050 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2052 /* In case of node-splitting, the PP nodes receive the long-range
2053 * forces, virial and energy from the PME nodes here.
2055 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2060 post_process_forces(cr, step, nrnb, wcycle,
2061 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2062 vir_force, mdatoms, graph, fr, vsite,
2066 if (flags & GMX_FORCE_ENERGY)
2068 /* Sum the potential energy terms from group contributions */
2069 sum_epot(&(enerd->grpp), enerd->term);
2071 if (!EI_TPI(inputrec->eI))
2073 checkPotentialEnergyValidity(step, *enerd, *inputrec);
2079 void do_force(FILE *fplog,
2080 const t_commrec *cr,
2081 const gmx_multisim_t *ms,
2082 const t_inputrec *inputrec,
2086 gmx_wallcycle_t wcycle,
2087 gmx_localtop_t *top,
2088 const gmx_groups_t *groups,
2090 gmx::PaddedArrayRef<gmx::RVec> x,
2092 gmx::PaddedArrayRef<gmx::RVec> force,
2094 const t_mdatoms *mdatoms,
2095 gmx_enerdata_t *enerd,
2097 gmx::ArrayRef<real> lambda,
2100 const gmx_vsite_t *vsite,
2103 const gmx_edsam *ed,
2105 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2106 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2108 /* modify force flag if not doing nonbonded */
2109 if (!fr->bNonbonded)
2111 flags &= ~GMX_FORCE_NONBONDED;
2114 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2115 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2117 switch (inputrec->cutoff_scheme)
2120 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2121 awh, step, nrnb, wcycle,
2128 lambda.data(), graph,
2133 ddOpenBalanceRegion,
2134 ddCloseBalanceRegion);
2137 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2138 awh, step, nrnb, wcycle,
2145 lambda.data(), graph,
2149 ddOpenBalanceRegion,
2150 ddCloseBalanceRegion);
2153 gmx_incons("Invalid cut-off scheme passed!");
2156 /* In case we don't have constraints and are using GPUs, the next balancing
2157 * region starts here.
2158 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2159 * virial calculation and COM pulling, is not thus not included in
2160 * the balance timing, which is ok as most tasks do communication.
2162 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2164 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2169 void do_constrain_first(FILE *fplog, Constraints *constr,
2170 t_inputrec *ir, t_mdatoms *md,
2173 int i, m, start, end;
2175 real dt = ir->delta_t;
2179 /* We need to allocate one element extra, since we might use
2180 * (unaligned) 4-wide SIMD loads to access rvec entries.
2182 snew(savex, state->natoms + 1);
2189 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2190 start, md->homenr, end);
2192 /* Do a first constrain to reset particles... */
2193 step = ir->init_step;
2196 char buf[STEPSTRSIZE];
2197 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2198 gmx_step_str(step, buf));
2202 /* constrain the current position */
2203 constr->apply(TRUE, FALSE,
2205 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2207 state->lambda[efptBONDED], &dvdl_dum,
2208 nullptr, nullptr, ConstraintVariable::Positions);
2211 /* constrain the inital velocity, and save it */
2212 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
2213 constr->apply(TRUE, FALSE,
2215 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2217 state->lambda[efptBONDED], &dvdl_dum,
2218 nullptr, nullptr, ConstraintVariable::Velocities);
2220 /* constrain the inital velocities at t-dt/2 */
2221 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2223 for (i = start; (i < end); i++)
2225 for (m = 0; (m < DIM); m++)
2227 /* Reverse the velocity */
2228 state->v[i][m] = -state->v[i][m];
2229 /* Store the position at t-dt in buf */
2230 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2233 /* Shake the positions at t=-dt with the positions at t=0
2234 * as reference coordinates.
2238 char buf[STEPSTRSIZE];
2239 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2240 gmx_step_str(step, buf));
2243 constr->apply(TRUE, FALSE,
2245 as_rvec_array(state->x.data()), savex, nullptr,
2247 state->lambda[efptBONDED], &dvdl_dum,
2248 as_rvec_array(state->v.data()), nullptr, ConstraintVariable::Positions);
2250 for (i = start; i < end; i++)
2252 for (m = 0; m < DIM; m++)
2254 /* Re-reverse the velocities */
2255 state->v[i][m] = -state->v[i][m];
2264 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2265 double *enerout, double *virout)
2267 double enersum, virsum;
2268 double invscale, invscale2, invscale3;
2269 double r, ea, eb, ec, pa, pb, pc, pd;
2274 invscale = 1.0/scale;
2275 invscale2 = invscale*invscale;
2276 invscale3 = invscale*invscale2;
2278 /* Following summation derived from cubic spline definition,
2279 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2280 * the cubic spline. We first calculate the negative of the
2281 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2282 * add the more standard, abrupt cutoff correction to that result,
2283 * yielding the long-range correction for a switched function. We
2284 * perform both the pressure and energy loops at the same time for
2285 * simplicity, as the computational cost is low. */
2289 /* Since the dispersion table has been scaled down a factor
2290 * 6.0 and the repulsion a factor 12.0 to compensate for the
2291 * c6/c12 parameters inside nbfp[] being scaled up (to save
2292 * flops in kernels), we need to correct for this.
2303 for (ri = rstart; ri < rend; ++ri)
2307 eb = 2.0*invscale2*r;
2311 pb = 3.0*invscale2*r;
2312 pc = 3.0*invscale*r*r;
2315 /* this "8" is from the packing in the vdwtab array - perhaps
2316 should be defined? */
2318 offset = 8*ri + offstart;
2319 y0 = vdwtab[offset];
2320 f = vdwtab[offset+1];
2321 g = vdwtab[offset+2];
2322 h = vdwtab[offset+3];
2324 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2325 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2327 *enerout = 4.0*M_PI*enersum*tabfactor;
2328 *virout = 4.0*M_PI*virsum*tabfactor;
2331 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2333 double eners[2], virs[2], enersum, virsum;
2334 double r0, rc3, rc9;
2336 real scale, *vdwtab;
2338 fr->enershiftsix = 0;
2339 fr->enershifttwelve = 0;
2340 fr->enerdiffsix = 0;
2341 fr->enerdifftwelve = 0;
2343 fr->virdifftwelve = 0;
2345 const interaction_const_t *ic = fr->ic;
2347 if (eDispCorr != edispcNO)
2349 for (i = 0; i < 2; i++)
2354 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2355 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2356 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2357 (ic->vdwtype == evdwSHIFT) ||
2358 (ic->vdwtype == evdwSWITCH))
2360 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2361 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2362 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2365 "With dispersion correction rvdw-switch can not be zero "
2366 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2369 /* TODO This code depends on the logic in tables.c that
2370 constructs the table layout, which should be made
2371 explicit in future cleanup. */
2372 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2373 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2374 scale = fr->dispersionCorrectionTable->scale;
2375 vdwtab = fr->dispersionCorrectionTable->data;
2377 /* Round the cut-offs to exact table values for precision */
2378 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2379 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2381 /* The code below has some support for handling force-switching, i.e.
2382 * when the force (instead of potential) is switched over a limited
2383 * region. This leads to a constant shift in the potential inside the
2384 * switching region, which we can handle by adding a constant energy
2385 * term in the force-switch case just like when we do potential-shift.
2387 * For now this is not enabled, but to keep the functionality in the
2388 * code we check separately for switch and shift. When we do force-switch
2389 * the shifting point is rvdw_switch, while it is the cutoff when we
2390 * have a classical potential-shift.
2392 * For a pure potential-shift the potential has a constant shift
2393 * all the way out to the cutoff, and that is it. For other forms
2394 * we need to calculate the constant shift up to the point where we
2395 * start modifying the potential.
2397 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2403 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2404 (ic->vdwtype == evdwSHIFT))
2406 /* Determine the constant energy shift below rvdw_switch.
2407 * Table has a scale factor since we have scaled it down to compensate
2408 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2410 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2411 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2413 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2415 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2416 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2419 /* Add the constant part from 0 to rvdw_switch.
2420 * This integration from 0 to rvdw_switch overcounts the number
2421 * of interactions by 1, as it also counts the self interaction.
2422 * We will correct for this later.
2424 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2425 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2427 /* Calculate the contribution in the range [r0,r1] where we
2428 * modify the potential. For a pure potential-shift modifier we will
2429 * have ri0==ri1, and there will not be any contribution here.
2431 for (i = 0; i < 2; i++)
2435 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2436 eners[i] -= enersum;
2440 /* Alright: Above we compensated by REMOVING the parts outside r0
2441 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2443 * Regardless of whether r0 is the point where we start switching,
2444 * or the cutoff where we calculated the constant shift, we include
2445 * all the parts we are missing out to infinity from r0 by
2446 * calculating the analytical dispersion correction.
2448 eners[0] += -4.0*M_PI/(3.0*rc3);
2449 eners[1] += 4.0*M_PI/(9.0*rc9);
2450 virs[0] += 8.0*M_PI/rc3;
2451 virs[1] += -16.0*M_PI/(3.0*rc9);
2453 else if (ic->vdwtype == evdwCUT ||
2454 EVDW_PME(ic->vdwtype) ||
2455 ic->vdwtype == evdwUSER)
2457 if (ic->vdwtype == evdwUSER && fplog)
2460 "WARNING: using dispersion correction with user tables\n");
2463 /* Note that with LJ-PME, the dispersion correction is multiplied
2464 * by the difference between the actual C6 and the value of C6
2465 * that would produce the combination rule.
2466 * This means the normal energy and virial difference formulas
2470 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2472 /* Contribution beyond the cut-off */
2473 eners[0] += -4.0*M_PI/(3.0*rc3);
2474 eners[1] += 4.0*M_PI/(9.0*rc9);
2475 if (ic->vdw_modifier == eintmodPOTSHIFT)
2477 /* Contribution within the cut-off */
2478 eners[0] += -4.0*M_PI/(3.0*rc3);
2479 eners[1] += 4.0*M_PI/(3.0*rc9);
2481 /* Contribution beyond the cut-off */
2482 virs[0] += 8.0*M_PI/rc3;
2483 virs[1] += -16.0*M_PI/(3.0*rc9);
2488 "Dispersion correction is not implemented for vdw-type = %s",
2489 evdw_names[ic->vdwtype]);
2492 /* When we deprecate the group kernels the code below can go too */
2493 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2495 /* Calculate self-interaction coefficient (assuming that
2496 * the reciprocal-space contribution is constant in the
2497 * region that contributes to the self-interaction).
2499 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2501 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2502 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2505 fr->enerdiffsix = eners[0];
2506 fr->enerdifftwelve = eners[1];
2507 /* The 0.5 is due to the Gromacs definition of the virial */
2508 fr->virdiffsix = 0.5*virs[0];
2509 fr->virdifftwelve = 0.5*virs[1];
2513 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2514 matrix box, real lambda, tensor pres, tensor virial,
2515 real *prescorr, real *enercorr, real *dvdlcorr)
2517 gmx_bool bCorrAll, bCorrPres;
2518 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2528 if (ir->eDispCorr != edispcNO)
2530 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2531 ir->eDispCorr == edispcAllEnerPres);
2532 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2533 ir->eDispCorr == edispcAllEnerPres);
2535 invvol = 1/det(box);
2538 /* Only correct for the interactions with the inserted molecule */
2539 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2544 dens = fr->numAtomsForDispersionCorrection*invvol;
2545 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2548 if (ir->efep == efepNO)
2550 avcsix = fr->avcsix[0];
2551 avctwelve = fr->avctwelve[0];
2555 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2556 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2559 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2560 *enercorr += avcsix*enerdiff;
2562 if (ir->efep != efepNO)
2564 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2568 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2569 *enercorr += avctwelve*enerdiff;
2570 if (fr->efep != efepNO)
2572 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2578 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2579 if (ir->eDispCorr == edispcAllEnerPres)
2581 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2583 /* The factor 2 is because of the Gromacs virial definition */
2584 spres = -2.0*invvol*svir*PRESFAC;
2586 for (m = 0; m < DIM; m++)
2588 virial[m][m] += svir;
2589 pres[m][m] += spres;
2594 /* Can't currently control when it prints, for now, just print when degugging */
2599 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2605 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2606 *enercorr, spres, svir);
2610 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2614 if (fr->efep != efepNO)
2616 *dvdlcorr += dvdlambda;
2621 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2622 const gmx_mtop_t *mtop, rvec x[],
2628 if (bFirst && fplog)
2630 fprintf(fplog, "Removing pbc first time\n");
2635 for (const gmx_molblock_t &molb : mtop->molblock)
2637 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2638 if (moltype.atoms.nr == 1 ||
2639 (!bFirst && moltype.cgs.nr == 1))
2641 /* Just one atom or charge group in the molecule, no PBC required */
2642 as += molb.nmol*moltype.atoms.nr;
2646 /* Pass NULL iso fplog to avoid graph prints for each molecule type */
2647 mk_graph_ilist(nullptr, moltype.ilist,
2648 0, moltype.atoms.nr, FALSE, FALSE, graph);
2650 for (mol = 0; mol < molb.nmol; mol++)
2652 mk_mshift(fplog, graph, ePBC, box, x+as);
2654 shift_self(graph, box, x+as);
2655 /* The molecule is whole now.
2656 * We don't need the second mk_mshift call as in do_pbc_first,
2657 * since we no longer need this graph.
2660 as += moltype.atoms.nr;
2668 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2669 const gmx_mtop_t *mtop, rvec x[])
2671 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2674 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2675 const gmx_mtop_t *mtop, rvec x[])
2677 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2680 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2683 nth = gmx_omp_nthreads_get(emntDefault);
2685 #pragma omp parallel for num_threads(nth) schedule(static)
2686 for (t = 0; t < nth; t++)
2690 size_t natoms = x.size();
2691 size_t offset = (natoms*t )/nth;
2692 size_t len = (natoms*(t + 1))/nth - offset;
2693 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2695 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2699 // TODO This can be cleaned up a lot, and move back to runner.cpp
2700 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2701 const t_inputrec *inputrec,
2702 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2703 gmx_walltime_accounting_t walltime_accounting,
2704 nonbonded_verlet_t *nbv,
2706 gmx_bool bWriteStat)
2708 t_nrnb *nrnb_tot = nullptr;
2710 double nbfs = 0, mflop = 0;
2711 double elapsed_time,
2712 elapsed_time_over_all_ranks,
2713 elapsed_time_over_all_threads,
2714 elapsed_time_over_all_threads_over_all_ranks;
2715 /* Control whether it is valid to print a report. Only the
2716 simulation master may print, but it should not do so if the run
2717 terminated e.g. before a scheduled reset step. This is
2718 complicated by the fact that PME ranks are unaware of the
2719 reason why they were sent a pmerecvqxFINISH. To avoid
2720 communication deadlocks, we always do the communication for the
2721 report, even if we've decided not to write the report, because
2722 how long it takes to finish the run is not important when we've
2723 decided not to report on the simulation performance.
2725 Further, we only report performance for dynamical integrators,
2726 because those are the only ones for which we plan to
2727 consider doing any optimizations. */
2728 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2730 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2732 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2733 printReport = false;
2740 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2741 cr->mpi_comm_mysim);
2749 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2750 elapsed_time_over_all_ranks = elapsed_time;
2751 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2752 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2756 /* reduce elapsed_time over all MPI ranks in the current simulation */
2757 MPI_Allreduce(&elapsed_time,
2758 &elapsed_time_over_all_ranks,
2759 1, MPI_DOUBLE, MPI_SUM,
2760 cr->mpi_comm_mysim);
2761 elapsed_time_over_all_ranks /= cr->nnodes;
2762 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2763 * current simulation. */
2764 MPI_Allreduce(&elapsed_time_over_all_threads,
2765 &elapsed_time_over_all_threads_over_all_ranks,
2766 1, MPI_DOUBLE, MPI_SUM,
2767 cr->mpi_comm_mysim);
2773 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2780 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2782 print_dd_statistics(cr, inputrec, fplog);
2785 /* TODO Move the responsibility for any scaling by thread counts
2786 * to the code that handled the thread region, so that there's a
2787 * mechanism to keep cycle counting working during the transition
2788 * to task parallelism. */
2789 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2790 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2791 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2792 auto cycle_sum(wallcycle_sum(cr, wcycle));
2796 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2797 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2798 if (pme_gpu_task_enabled(pme))
2800 pme_gpu_get_timings(pme, &pme_gpu_timings);
2802 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2803 elapsed_time_over_all_ranks,
2808 if (EI_DYNAMICS(inputrec->eI))
2810 delta_t = inputrec->delta_t;
2815 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2816 elapsed_time_over_all_ranks,
2817 walltime_accounting_get_nsteps_done(walltime_accounting),
2818 delta_t, nbfs, mflop);
2822 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2823 elapsed_time_over_all_ranks,
2824 walltime_accounting_get_nsteps_done(walltime_accounting),
2825 delta_t, nbfs, mflop);
2830 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2832 /* this function works, but could probably use a logic rewrite to keep all the different
2833 types of efep straight. */
2835 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2840 t_lambda *fep = ir->fepvals;
2841 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2842 if checkpoint is set -- a kludge is in for now
2845 for (int i = 0; i < efptNR; i++)
2847 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2848 if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
2850 lambda[i] = fep->init_lambda;
2853 lam0[i] = lambda[i];
2858 lambda[i] = fep->all_lambda[i][*fep_state];
2861 lam0[i] = lambda[i];
2867 /* need to rescale control temperatures to match current state */
2868 for (int i = 0; i < ir->opts.ngtc; i++)
2870 if (ir->opts.ref_t[i] > 0)
2872 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2877 /* Send to the log the information on the current lambdas */
2878 if (fplog != nullptr)
2880 fprintf(fplog, "Initial vector of lambda components:[ ");
2881 for (const auto &l : lambda)
2883 fprintf(fplog, "%10.4f ", l);
2885 fprintf(fplog, "]\n");
2890 void init_md(FILE *fplog,
2891 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2892 t_inputrec *ir, const gmx_output_env_t *oenv,
2893 const MdrunOptions &mdrunOptions,
2894 double *t, double *t0,
2895 t_state *globalState, double *lam0,
2896 t_nrnb *nrnb, gmx_mtop_t *mtop,
2898 gmx::BoxDeformation *deform,
2899 int nfile, const t_filenm fnm[],
2900 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2901 tensor force_vir, tensor shake_vir, rvec mu_tot,
2902 gmx_bool *bSimAnn, t_vcm **vcm,
2903 gmx_wallcycle_t wcycle)
2907 /* Initial values */
2908 *t = *t0 = ir->init_t;
2911 for (i = 0; i < ir->opts.ngtc; i++)
2913 /* set bSimAnn if any group is being annealed */
2914 if (ir->opts.annealing[i] != eannNO)
2920 /* Initialize lambda variables */
2921 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2922 * We currently need to call initialize_lambdas on non-master ranks
2923 * to initialize lam0.
2927 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2932 std::array<real, efptNR> tmpLambda;
2933 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2936 // TODO upd is never NULL in practice, but the analysers don't know that
2939 *upd = init_update(ir, deform);
2943 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2948 *vcm = init_vcm(fplog, &mtop->groups, ir);
2951 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2953 if (ir->etc == etcBERENDSEN)
2955 please_cite(fplog, "Berendsen84a");
2957 if (ir->etc == etcVRESCALE)
2959 please_cite(fplog, "Bussi2007a");
2961 if (ir->eI == eiSD1)
2963 please_cite(fplog, "Goga2012");
2970 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2972 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2973 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2976 /* Initiate variables */
2977 clear_mat(force_vir);
2978 clear_mat(shake_vir);