/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
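// Usage note (illustrative): setting the variable in the shell before a run,
// e.g. "GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun ...", falls back to the
// static wait order; only the presence of the variable matters, not its value,
// since only getenv() != nullptr is checked above.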
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;

    fprintf(out, "step %s", gmx_step_str(step, buf));

    if (step >= ir->nstlist)
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        finish = static_cast<time_t>(seconds_since_epoch + dt);
        gmx_ctime_r(&finish, timebuf, STRLEN);
        sprintf(buf, "%s", timebuf);
        buf[strlen(buf)-1] = '\0';
        fprintf(out, ", will finish %s", buf);

        fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
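        /* Unit bookkeeping for the ns/day estimate below: ir->delta_t is in
         * ps, so delta_t/1000 is ns per step; dividing by time_per_step
         * (seconds per step) gives ns/s, and 24*60*60 converts that to
         * ns/day.
         */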
        fprintf(out, " performance: %.1f ns/day    ",
                ir->delta_t/1000*24*60*60/time_per_step);

    GMX_UNUSED_VALUE(cr);

void print_date_and_time(FILE *fplog, int nodeid, const char *title,
    char time_string[STRLEN];

        char   timebuf[STRLEN];
        time_t temp_time = static_cast<time_t>(the_time);

        gmx_ctime_r(&temp_time, timebuf, STRLEN);
        for (i = 0; timebuf[i] >= ' '; i++)
            time_string[i] = timebuf[i];
        time_string[i] = '\0';

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);

void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));

static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
    const int end = forceToAdd.size();
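    /* nt below is read only inside the OpenMP pragma; the gmx_unused
     * attribute keeps builds without OpenMP warning-free.
     */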
    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
        rvec_inc(f[i], forceToAdd[i]);

static void pme_gpu_reduce_outputs(gmx_wallcycle_t                wcycle,
                                   gmx::ForceWithVirial          *forceWithVirial,
                                   gmx::ArrayRef<const gmx::RVec> pmeForces,
                                   gmx_enerdata_t                *enerd,
    wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
    GMX_ASSERT(forceWithVirial, "Invalid force pointer");
    forceWithVirial->addVirialContribution(vir_Q);
    enerd->term[F_COUL_RECIP] += Vlr_q;
    sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
    wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);

static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
        vir_part[i][ZZ] += fr->vir_wall_z[i];

        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);

static void pull_potential_wrapper(const t_commrec *cr,
                                   const t_inputrec *ir,
                                   matrix box, gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial *force,
                                   const t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   gmx_wallcycle_t wcycle)
    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);

static void pme_receive_force_ener(const t_commrec      *cr,
                                   gmx::ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t       *enerd,
                                   gmx_wallcycle_t       wcycle)
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);

    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);

static void print_large_forces(FILE *fp,
    real           force2Tolerance = gmx::square(forceTolerance);
    std::uintmax_t numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
    if (numNonFinite > 0)
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
static void post_process_forces(const t_commrec *cr,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                gmx::ForceWithVirial *forceWithVirial,
                                const t_mdatoms *mdatoms,
                                const gmx_vsite_t *vsite,
    if (fr->haveDirectVirialContributions)
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL), virial,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);

        if (flags & GMX_FORCE_VIRIAL)
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);

        pr_rvecs(debug, 0, "vir_force", vir_force, DIM);

    if (fr->print_force >= 0)
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         gmx_wallcycle_t wcycle)
    if (!(flags & GMX_FORCE_NONBONDED))
        /* skip non-bonded calculation */

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
        gmx_incons("Invalid cut-off scheme passed!");

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);

        wallcycle_sub_start(wcycle, ewcsNONBONDED);

    switch (nbvg->kernel_type)
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             enerd->grpp.ener[egCOULSR],
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 enerd->grpp.ener[egCOULSR],
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);

            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");

    if (!bUsingGpuKernels)
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
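    /* Illustration of the counter layout assumed above (from the eNR table
     * ordering): for each kernel flavor the force+energy counter directly
     * follows the force-only counter, e.g. eNR_NBNXN_LJ_RF followed by its
     * energy variant, so "+ 1" selects the F+E flop counter.
     */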
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    if (ic->vdw_modifier == eintmodPOTSWITCH)
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    if (ic->vdwtype == evdwPME)
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);

static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             const t_mdatoms *mdatoms,
                             gmx_enerdata_t *enerd,
                             gmx_wallcycle_t wcycle)
    nb_kernel_data_t kernel_data;
    real             dvdl_nb[efptNR];

    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    if (flags & GMX_FORCE_VIRIAL)
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    if (flags & GMX_FORCE_ENERGY)
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;

    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    if (fepvals->sc_alpha != 0)
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energies contributions.
     */
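    /* Background (assumed from the soft-core functional form): without
     * soft-core (sc_alpha == 0) the nonbonded energy is linear in lambda, so
     * foreign-lambda energies follow from the linear dvdl terms; with
     * soft-core the lambda dependence is non-linear and each foreign lambda
     * point has to be re-evaluated explicitly, as done below.
     */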
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
            for (j = 0; j < efptNR; j++)
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);

gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
    return nbv != nullptr && nbv->bUseGPU;

static inline void clear_rvecs_omp(int n, rvec v[])
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
        for (int i = 0; i < n; i++)

#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)

/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
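 *
 * A sketch of the estimate used below (equipartition): each temperature-
 * coupled group contributes nrdf*0.5*kB*ref_t, and the coupled-group total
 * is scaled by (nrdfCoupled + nrdfUncoupled)/nrdfCoupled to account for the
 * uncoupled degrees of freedom.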
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
        if (groupOptions.tau_t[g] >= 0)
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
            nrdfUncoupled += groupOptions.nrdf[g];

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t               step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow catching large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
        gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     awh              The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation Enforced rotation module.
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE             *fplog,
                     const t_inputrec *inputrec,
                     gmx_enfrot       *enforcedRotation,
                     gmx_wallcycle_t   wcycle,
                     ForceProviders   *forceProviders,
                     gmx::ArrayRef<const gmx::RVec> x,
                     const t_mdatoms  *mdatoms,
                     gmx::ForceWithVirial *forceWithVirial,
                     gmx_enerdata_t   *enerd,
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES);

    /* NOTE: Currently all ForceProviders only provide forces.
     * When they also provide energies, remove this conditional.
     */
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
        pull_potential_wrapper(cr, inputrec, box, x,
                               mdatoms, enerd, lambda, t,

        enerd->term[F_COM_PULL] +=
            awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                              t, step, wcycle, fplog);

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);

        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);

/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata       The PME structure
 * \param[in]  box           The box matrix
 * \param[in]  x             Coordinate array
 * \param[in]  flags         Force flags
 * \param[in]  wcycle        The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      gmx_wallcycle_t wcycle)
    int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
    pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
    pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;

    pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);

/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata        The PME structure
 * \param[in]  wcycle         The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t      *pmedata,
                                     gmx_wallcycle_t wcycle)
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);

/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     wcycle           The wallcycle structure
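 *
 * A minimal sketch of the polling pattern used in the body (assuming the
 * try-finish calls shown below):
 * \code
 *   while (!pmeDone || !nbDone)
 *   {
 *       // Poll each unfinished task; once only one task remains,
 *       // request a blocking Wait instead of a non-blocking Check.
 *       pmeDone = pme_gpu_try_finish_task(..., nbDone ? Wait : Check);
 *       nbDone  = nbnxn_gpu_try_finish_task(..., pmeDone ? Wait : Check);
 *       // Reduce the outputs of whichever task just completed, overlapping
 *       // that reduction with the still-running task.
 *   }
 * \endcode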
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
                                        const gmx_pme_t                *pmedata,
                                        gmx::PaddedArrayRef<gmx::RVec> *force,
                                        gmx::ForceWithVirial           *forceWithVirial,
                                        gmx_enerdata_t                 *enerd,
                                        gmx_wallcycle_t                 wcycle)
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
                                                   vir_Q, &Vlr_q, completionType);
                pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
                                       enerd, vir_Q, Vlr_q);

            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                    fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
                                               nbv->nbat, as_rvec_array(force->data()), wcycle);

/*! \brief
 *  Launch the dynamic rolling pruning GPU task.
 *
 *  We currently alternate local/non-local list pruning in odd-even steps
 *  (only pruning every second step without DD).
 *
 * \param[in]     cr               The communication record
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     inputrec         The input record
 * \param[in]     step             The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
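    /* Worked example (assuming nstlist = 10): stepWithCurrentList runs over
     * 1..8 relative to the step the outer list was created on; even offsets
     * prune the local list, odd offsets the non-local one (odd offsets
     * launch nothing without DD).
     */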
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || DOMAINDECOMP(cr)))
        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                          stepIsEven ? eintLocal : eintNonlocal,

static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx_enfrot *enforcedRotation,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                                gmx::PaddedArrayRef<gmx::RVec> force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                const gmx_edsam *ed,
                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
        cg1 = cr->dd->globalAtomGroupIndices.size();

    update_forcerec(fr, box);

    if (inputrecNeedMutot(inputrec))
        /* Calculate total (local) dipole moment in a temporary common array.
         * This makes it possible to sum them over nodes faster.
         */
        calc_mu(start, homenr,
                x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,

    if (fr->ePBC != epbcNONE)
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
            calc_shifts(box, fr->shift_vec);

            put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
            unshift_self(graph, box, as_rvec_array(x.data()));

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->nbat);

    if (!thisRankHasDuty(cr, DUTY_PME))
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
#endif /* GMX_MPI */

        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);

    /* do gridding for pair search */
        if (graph && bStateChanged)
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));

        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!DOMAINDECOMP(cr))
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
                              0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
                              nbv->grp[eintLocal].kernel_type,
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);

            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                       fr->cginfo, as_rvec_array(x.data()),
                                       nbv->grp[eintNonlocal].kernel_type,
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
        wallcycle_stop(wcycle, ewcNS);

    /* initialize the GPU atom data and copy shift vector */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);

        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);

    /* do local pair search */
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            nbv->grp[eintLocal].kernel_type,
        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
        wallcycle_stop(wcycle, ewcNS);

        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),

        if (DOMAINDECOMP(cr))
            ddOpenBalanceRegionGpu(cr->dd);

        wallcycle_start(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        /* launch local nonbonded work on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);

        // In PME GPU and mixed mode we launch FFT / gather after the
        // X copy/transform to allow overlap as well as after the GPU NB
        // launch to avoid FFT launch overhead hijacking the CPU and delaying
        // the nonbonded kernel.
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
            nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                nbv->grp[eintNonlocal].kernel_type,
            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
            wallcycle_stop(wcycle, ewcNS);

            dd_move_x(cr->dd, box, x, wcycle);

            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),

            wallcycle_start(wcycle, ewcLAUNCH_GPU);
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            /* launch non-local nonbonded tasks on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU);

        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (DOMAINDECOMP(cr))
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                     flags, eatNonlocal);
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);

    if (bStateChanged && inputrecNeedMutot(inputrec))
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);

        for (i = 0; i < 2; i++)
            for (j = 0; j < DIM; j++)
                fr->mu_tot[i][j] = mu[i*DIM + j];

        if (fr->efep == efepNO)
            copy_rvec(fr->mu_tot[0], mu_tot);
            for (j = 0; j < DIM; j++)
                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                    lambda[efptCOUL]*fr->mu_tot[1][j];

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);

        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *const f = as_rvec_array(force.data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force;
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
            forceRef = *fr->forceBufferForDirectVirialContributions;

        /* TODO: update comment
         * We only compute forces on local atoms. Note that vsites can
         * spread to non-local atoms, but that part of the buffer is
         * cleared separately in the vsite spreading code.
         */
        clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));

        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);

    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
        clear_pull_forces(inputrec->pull_work);

    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     step, nrnb, wcycle);
    if (fr->efep != efepNO)
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, as_rvec_array(x.data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, as_rvec_array(x.data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);

        if (DOMAINDECOMP(cr))
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);

            aloc = eintNonlocal;

        /* Add all the non-bonded force to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);

        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);

        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,

    /* update QMMMrec, if necessary */
        update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);
    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         fr->forceProviders, box, x, mdatoms, lambda,
                         flags, &forceWithVirial, enerd,

    /* wait for non-local forces (or calculate in emulation mode) */
    if (DOMAINDECOMP(cr))
            wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
            nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);

            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                         step, nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);

        /* skip the reduction if there was no non-local work to do */
        if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
                                           nbv->nbat, f, wcycle);

    if (DOMAINDECOMP(cr))
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            ddCloseBalanceRegionCpu(cr->dd);

            dd_move_f(cr->dd, force, fr->fshift, wcycle);

    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
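    // Note: as the condition above shows, the alternating path is taken only
    // without domain decomposition (the scheme is currently implemented for
    // the single-domain case only) and can be disabled at run time through
    // GMX_DISABLE_ALTERNATING_GPU_WAIT, see c_disableAlternatingWait above.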
    if (alternateGpuWait)
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);

    if (!alternateGpuWait && useGpuPme)
        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
        pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);

    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && bUseGPU)
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         */
        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);

    if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);

        pme_gpu_reinit_computation(fr->pmedata, wcycle);

        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);

        /* Is dynamic pair-list pruning activated? */
        if (nbv->listParams->useDynamicPruning)
            launchGpuRollingPruning(cr, nbv, inputrec, step);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);

    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (bUseOrEmulGPU && !alternateGpuWait)
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
                                       nbv->nbat, f, wcycle);

    if (DOMAINDECOMP(cr))
        dd_force_flop_stop(cr->dd, nrnb);

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
            spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);

        if (flags & GMX_FORCE_VIRIAL)
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);

        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,

    if (flags & GMX_FORCE_ENERGY)
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
1749 static void do_force_cutsGROUP(FILE *fplog,
1750 const t_commrec *cr,
1751 const gmx_multisim_t *ms,
1752 const t_inputrec *inputrec,
1754 gmx_enfrot *enforcedRotation,
1757 gmx_wallcycle_t wcycle,
1758 gmx_localtop_t *top,
1759 const gmx_groups_t *groups,
1760 matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
1762 gmx::PaddedArrayRef<gmx::RVec> force,
1764 const t_mdatoms *mdatoms,
1765 gmx_enerdata_t *enerd,
1770 const gmx_vsite_t *vsite,
1773 const gmx_edsam *ed,
1775 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1776 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1780 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1784 const int start = 0;
1785 const int homenr = mdatoms->homenr;
1787 clear_mat(vir_force);
1790 if (DOMAINDECOMP(cr))
1792 cg1 = cr->dd->globalAtomGroupIndices.size();
1803 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1804 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1805 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1806 bFillGrid = (bNS && bStateChanged);
1807 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1808 bDoForces = (flags & GMX_FORCE_FORCES);
1812 update_forcerec(fr, box);
1814 if (inputrecNeedMutot(inputrec))
1816 /* Calculate total (local) dipole moment in a temporary common array.
1817 * This makes it possible to sum them over nodes faster.
1819 calc_mu(start, homenr,
1820 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1825 if (fr->ePBC != epbcNONE)
1827 /* Compute shift vectors every step,
1828 * because of pressure coupling or box deformation!
1830 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1832 calc_shifts(box, fr->shift_vec);
1837 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1838 &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1839 inc_nrnb(nrnb, eNR_CGCM, homenr);
1840 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1842 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1844 unshift_self(graph, box, as_rvec_array(x.data()));
1849 calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
1850 inc_nrnb(nrnb, eNR_CGCM, homenr);
1853 if (bCalcCGCM && gmx_debug_at)
1855 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1859 if (!thisRankHasDuty(cr, DUTY_PME))
1861 /* Send particle coordinates to the pme nodes.
1862 * Since this is only implemented for domain decomposition
1863 * and domain decomposition does not use the graph,
1864 * we do not need to worry about shifting.
1866 gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
1867 lambda[efptCOUL], lambda[efptVDW],
1868 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1871 #endif /* GMX_MPI */
1873 /* Communicate coordinates and sum dipole if necessary */
1874 if (DOMAINDECOMP(cr))
1876 dd_move_x(cr->dd, box, x, wcycle);
1878 /* No GPU support, no move_x overlap, so reopen the balance region here */
1879 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1881 ddReopenBalanceRegionCpu(cr->dd);
1885 if (inputrecNeedMutot(inputrec))
1891 gmx_sumd(2*DIM, mu, cr);
1892 ddReopenBalanceRegionCpu(cr->dd);
1894 for (i = 0; i < 2; i++)
1896 for (j = 0; j < DIM; j++)
1898 fr->mu_tot[i][j] = mu[i*DIM + j];
1902 if (fr->efep == efepNO)
1904 copy_rvec(fr->mu_tot[0], mu_tot);
1908 for (j = 0; j < DIM; j++)
1911 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1916 /* Reset energies */
1917 reset_enerdata(enerd);
1918 clear_rvecs(SHIFTS, fr->fshift);
1922 wallcycle_start(wcycle, ewcNS);
1924 if (graph && bStateChanged)
1926 /* Calculate intramolecular shift vectors to make molecules whole */
1927 mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
1930 /* Do the actual neighbour searching */
1932 groups, top, mdatoms,
1933 cr, nrnb, bFillGrid);
1935 wallcycle_stop(wcycle, ewcNS);
1938 if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
1940 wallcycle_start(wcycle, ewcPPDURINGPME);
1941 dd_force_flop_start(cr->dd, nrnb);
1946 wallcycle_start(wcycle, ewcROT);
1947 do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
1948 wallcycle_stop(wcycle, ewcROT);
1951 /* Temporary solution until all routines take PaddedRVecVector */
1952 rvec *f = as_rvec_array(force.data());
1954 /* Start the force cycle counter.
1955 * Note that a different counter is used for dynamic load balancing.
1957 wallcycle_start(wcycle, ewcFORCE);
1959 gmx::ArrayRef<gmx::RVec> forceRef = force;
1962 /* If we need to compute the virial, we might need a separate
1963 * force buffer for algorithms for which the virial is calculated
1964 * separately, such as PME.
1966 if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
1968 forceRef = *fr->forceBufferForDirectVirialContributions;
1970 clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
1973 /* Clear the short- and long-range forces */
1974 clear_rvecs(fr->natoms_force_constr, f);
1977 /* forceWithVirial might need the full force atom range */
1978 gmx::ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
1980 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1982 clear_pull_forces(inputrec->pull_work);
1985 /* update QMMMrec, if necessary */
1988 update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
1991 /* Compute the bonded and non-bonded energies and optionally forces */
1992 do_force_lowlevel(fr, inputrec, &(top->idef),
1993 cr, ms, nrnb, wcycle, mdatoms,
1994 as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
1995 box, inputrec->fepvals, lambda,
1996 graph, &(top->excls), fr->mu_tot,
2000 wallcycle_stop(wcycle, ewcFORCE);
2002 if (DOMAINDECOMP(cr))
2004 dd_force_flop_stop(cr->dd, nrnb);
2006 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
2008 ddCloseBalanceRegionCpu(cr->dd);
2012 computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
2014 fr->forceProviders, box, x, mdatoms, lambda,
2015 flags, &forceWithVirial, enerd,
2020 /* Communicate the forces */
2021 if (DOMAINDECOMP(cr))
2023 dd_move_f(cr->dd, force, fr->fshift, wcycle);
2024 /* Do we need to communicate the separate force array
2025 * for terms that do not contribute to the single sum virial?
2026 * Position restraints and electric fields do not introduce
2027 * inter-cg forces, only full electrostatics methods do.
2028 * When we do not calculate the virial, fr->f_novirsum = f,
2029 * so we have already communicated these forces.
2031 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
2032 (flags & GMX_FORCE_VIRIAL))
2034 dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
2038 /* If we have NoVirSum forces, but we do not calculate the virial,
2039 * we sum fr->f_novirsum=f later.
2041 if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
2043 spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
2044 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
2047 if (flags & GMX_FORCE_VIRIAL)
2049 /* Calculation of the virial must be done after vsites! */
2050 calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
2051 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
2055 if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
2057 /* In case of node-splitting, the PP nodes receive the long-range
2058 * forces, virial and energy from the PME nodes here.
2060 pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
2065 post_process_forces(cr, step, nrnb, wcycle,
2066 top, box, as_rvec_array(x.data()), f, &forceWithVirial,
2067 vir_force, mdatoms, graph, fr, vsite,
2071 if (flags & GMX_FORCE_ENERGY)
2073 /* Sum the potential energy terms from group contributions */
2074 sum_epot(&(enerd->grpp), enerd->term);
2076 if (!EI_TPI(inputrec->eI))
2078 checkPotentialEnergyValidity(step, *enerd, *inputrec);
void do_force(FILE *fplog,
              const t_commrec *cr,
              const gmx_multisim_t *ms,
              const t_inputrec *inputrec,
              gmx::Awh *awh,
              gmx_enfrot *enforcedRotation,
              gmx_int64_t step,
              t_nrnb *nrnb,
              gmx_wallcycle_t wcycle,
              gmx_localtop_t *top,
              const gmx_groups_t *groups,
              matrix box,
              gmx::PaddedArrayRef<gmx::RVec> x,
              history_t *hist,
              gmx::PaddedArrayRef<gmx::RVec> force,
              tensor vir_force,
              const t_mdatoms *mdatoms,
              gmx_enerdata_t *enerd,
              t_fcdata *fcd,
              gmx::ArrayRef<real> lambda,
              t_graph *graph,
              t_forcerec *fr,
              const gmx_vsite_t *vsite,
              rvec mu_tot,
              double t,
              const gmx_edsam *ed,
              gmx_bool bBornRadii,
              int flags,
              DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
              DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    /* modify force flag if not doing nonbonded */
    if (!fr->bNonbonded)
    {
        flags &= ~GMX_FORCE_NONBONDED;
    }

    GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
    GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
    switch (inputrec->cutoff_scheme)
    {
        case ecutsVERLET:
            do_force_cutsVERLET(fplog, cr, ms, inputrec,
                                awh, enforcedRotation, step, nrnb, wcycle,
                                top, groups,
                                box, x, hist,
                                force, vir_force,
                                mdatoms,
                                enerd, fcd,
                                lambda.data(), graph,
                                fr, fr->ic,
                                vsite, mu_tot,
                                t, ed,
                                bBornRadii,
                                flags,
                                ddOpenBalanceRegion,
                                ddCloseBalanceRegion);
            break;
        case ecutsGROUP:
            do_force_cutsGROUP(fplog, cr, ms, inputrec,
                               awh, enforcedRotation, step, nrnb, wcycle,
                               top, groups,
                               box, x, hist,
                               force, vir_force,
                               mdatoms,
                               enerd, fcd,
                               lambda.data(), graph,
                               fr, vsite, mu_tot,
                               t, ed,
                               bBornRadii,
                               flags,
                               ddOpenBalanceRegion,
                               ddCloseBalanceRegion);
            break;
        default:
            gmx_incons("Invalid cut-off scheme passed!");
    }

    /* In case we don't have constraints and are using GPUs, the next balancing
     * region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
     * virial calculation and COM pulling, is thus not included in
     * the balance timing, which is OK as most tasks do communication.
     */
    if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
    }
}
void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
                        t_inputrec *ir, t_mdatoms *md,
                        t_state *state)
{
    int             i, m, start, end;
    gmx_int64_t     step;
    real            dt = ir->delta_t;
    real            dvdl_dum;
    rvec           *savex;

    /* We need to allocate one element extra, since we might use
     * (unaligned) 4-wide SIMD loads to access rvec entries.
     */
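    /* Worked example of why one extra rvec suffices (illustrative note,
     * not from the original source): with natoms = n, the last rvec
     * starts at flat real index 3*(n-1). An unaligned 4-wide SIMD load
     * from there touches indices 3n-3 .. 3n, i.e. one real past the
     * 3n reals the array owns. Allocating n+1 rvecs provides 3n+3
     * reals, which safely covers that overhang.
     */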
    snew(savex, state->natoms + 1);

    start = 0;
    end   = md->homenr;

    if (debug)
    {
        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
                start, md->homenr, end);
    }
    /* Do a first constraint pass to reset the particles... */
    step = ir->init_step;
    if (fplog)
    {
        char buf[STEPSTRSIZE];
        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
                gmx_step_str(step, buf));
    }
    dvdl_dum = 0;

    /* constrain the current position */
    constr->apply(TRUE, FALSE,
                  step, 0, 1.0,
                  as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
                  state->box,
                  state->lambda[efptBONDED], &dvdl_dum,
                  nullptr, nullptr, gmx::ConstraintVariable::Positions);
    if (EI_VV(ir->eI))
    {
        /* constrain the initial velocity, and save it */
        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
        constr->apply(TRUE, FALSE,
                      step, 0, 1.0,
                      as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
                      state->box,
                      state->lambda[efptBONDED], &dvdl_dum,
                      nullptr, nullptr, gmx::ConstraintVariable::Velocities);
    }
    /* constrain the initial velocities at t-dt/2 */
    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
    {
        for (i = start; (i < end); i++)
        {
            for (m = 0; (m < DIM); m++)
            {
                /* Reverse the velocity */
                state->v[i][m] = -state->v[i][m];
                /* Store the position at t-dt in savex: with v already
                 * reversed, x(0) + dt*(-v(0)) approximates x(-dt). */
                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
            }
        }
        /* Shake the positions at t=-dt with the positions at t=0
         * as reference coordinates.
         */
        if (fplog)
        {
            char buf[STEPSTRSIZE];
            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
                    gmx_step_str(step, buf));
        }
        dvdl_dum = 0;
        constr->apply(TRUE, FALSE,
                      step, 0, 1.0,
                      as_rvec_array(state->x.data()), savex, nullptr,
                      state->box,
                      state->lambda[efptBONDED], &dvdl_dum,
                      as_rvec_array(state->v.data()), nullptr, gmx::ConstraintVariable::Positions);

        for (i = start; i < end; i++)
        {
            for (m = 0; m < DIM; m++)
            {
                /* Re-reverse the velocities */
                state->v[i][m] = -state->v[i][m];
            }
        }
    }
    sfree(savex);
}
static void
integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
                double *enerout, double *virout)
{
    double enersum, virsum;
    double invscale, invscale2, invscale3;
    double r, ea, eb, ec, pa, pb, pc, pd;
    double y0, f, g, h;
    int    ri, offset;
    double tabfactor;

    invscale  = 1.0/scale;
    invscale2 = invscale*invscale;
    invscale3 = invscale*invscale2;

    /* Following summation derived from cubic spline definition,
     * Numerical Recipes in C, second edition, p. 113-116. Exact for
     * the cubic spline. We first calculate the negative of the
     * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
     * add the more standard, abrupt cutoff correction to that result,
     * yielding the long-range correction for a switched function. We
     * perform both the pressure and energy loops at the same time for
     * simplicity, as the computational cost is low. */
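    /* Added derivation note (not in the original source): on table bin ri
     * the spline is V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with eps in
     * [0,1]; the bin start is r = ri*invscale and a point in the bin is
     * r + eps*invscale. The per-bin energy integral over the spherical
     * shell is
     *   int r'^2 V dr' = int_0^1 (r + eps*invscale)^2 * invscale * V(eps) deps,
     * and expanding (r + eps*invscale)^2 * invscale = ea*eps^2 + eb*eps + ec
     * with ea = invscale^3, eb = 2*invscale^2*r, ec = invscale*r^2
     * reproduces the enersum line below term by term. Likewise
     * (r + eps*invscale)^3 = pa*eps^3 + pb*eps^2 + pc*eps + pd together
     * with dV/deps = f + 2*g*eps + 3*h*eps^2 reproduces the virsum line.
     * The overall factor 4*M_PI is applied once at the end.
     */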
    if (offstart == 0)
    {
        /* Since the dispersion table has been scaled down a factor
         * 6.0 and the repulsion a factor 12.0 to compensate for the
         * c6/c12 parameters inside nbfp[] being scaled up (to save
         * flops in kernels), we need to correct for this.
         */
        tabfactor = 6.0;
    }
    else
    {
        tabfactor = 12.0;
    }

    enersum = 0.0;
    virsum  = 0.0;
    for (ri = rstart; ri < rend; ++ri)
    {
        r  = ri*invscale;
        ea = invscale3;
        eb = 2.0*invscale2*r;
        ec = invscale*r*r;

        pa = invscale3;
        pb = 3.0*invscale2*r;
        pc = 3.0*invscale*r*r;
        pd = r*r*r;

        /* this "8" is from the packing in the vdwtab array - perhaps
           should be defined? */
        offset = 8*ri + offstart;
        y0     = vdwtab[offset];
        f      = vdwtab[offset+1];
        g      = vdwtab[offset+2];
        h      = vdwtab[offset+3];

        enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
        virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
    }
    *enerout = 4.0*M_PI*enersum*tabfactor;
    *virout  = 4.0*M_PI*virsum*tabfactor;
}
void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
{
    double eners[2], virs[2], enersum, virsum;
    double r0, rc3, rc9;
    int    ri0, ri1, i;
    real   scale, *vdwtab;

    fr->enershiftsix    = 0;
    fr->enershifttwelve = 0;
    fr->enerdiffsix     = 0;
    fr->enerdifftwelve  = 0;
    fr->virdiffsix      = 0;
    fr->virdifftwelve   = 0;

    const interaction_const_t *ic = fr->ic;

    if (eDispCorr != edispcNO)
    {
        for (i = 0; i < 2; i++)
        {
            eners[i] = 0;
            virs[i]  = 0;
        }
        if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
            (ic->vdw_modifier == eintmodPOTSWITCH) ||
            (ic->vdw_modifier == eintmodFORCESWITCH) ||
            (ic->vdwtype == evdwSHIFT) ||
            (ic->vdwtype == evdwSWITCH))
        {
            if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
                 (ic->vdw_modifier == eintmodFORCESWITCH) ||
                 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
            {
                gmx_fatal(FARGS,
                          "With dispersion correction rvdw-switch cannot be zero "
                          "for vdw-type = %s", evdw_names[ic->vdwtype]);
            }

            /* TODO This code depends on the logic in tables.c that
               constructs the table layout, which should be made
               explicit in future cleanup. */
            GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
                       "Dispersion-correction code needs a table with both repulsion and dispersion terms");
            scale  = fr->dispersionCorrectionTable->scale;
            vdwtab = fr->dispersionCorrectionTable->data;

            /* Round the cut-offs to exact table values for precision */
            ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
            ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
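            /* Concrete example (added note, not in the original source):
             * with a table scale of 500 points/nm, rvdw_switch = 0.9 nm
             * and rvdw = 1.0 nm, this picks ri0 = 450 and ri1 = 500, so
             * the switching region is integrated over exactly 50 bins.
             */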
            /* The code below has some support for handling force-switching, i.e.
             * when the force (instead of potential) is switched over a limited
             * region. This leads to a constant shift in the potential inside the
             * switching region, which we can handle by adding a constant energy
             * term in the force-switch case just like when we do potential-shift.
             *
             * For now this is not enabled, but to keep the functionality in the
             * code we check separately for switch and shift. When we do force-switch
             * the shifting point is rvdw_switch, while it is the cutoff when we
             * have a classical potential-shift.
             *
             * For a pure potential-shift the potential has a constant shift
             * all the way out to the cutoff, and that is it. For other forms
             * we need to calculate the constant shift up to the point where we
             * start modifying the potential.
             */
            ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;

            r0  = ri0/scale;
            rc3 = r0*r0*r0;
            rc9 = rc3*rc3*rc3;

            if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
                (ic->vdwtype == evdwSHIFT))
            {
                /* Determine the constant energy shift below rvdw_switch.
                 * Table has a scale factor since we have scaled it down to compensate
                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
                 */
                fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
            }
            else if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                fr->enershiftsix    = static_cast<real>(-1.0/(rc3*rc3));
                fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
            }

            /* Add the constant part from 0 to rvdw_switch.
             * This integration from 0 to rvdw_switch overcounts the number
             * of interactions by 1, as it also counts the self interaction.
             * We will correct for this later.
             */
            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
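            /* Worked integral for the two lines above (added note, not in
             * the original source): the shift is a constant, so summing it
             * over a uniform particle density inside radius r0 gives
             *   int_0^r0 4*pi*r^2*shift dr = (4*pi/3)*r0^3*shift,
             * which is exactly shift*4.0*M_PI*rc3/3.0 per unit c6 or c12.
             */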
            /* Calculate the contribution in the range [r0,r1] where we
             * modify the potential. For a pure potential-shift modifier we will
             * have ri0==ri1, and there will not be any contribution here.
             */
            for (i = 0; i < 2; i++)
            {
                enersum = 0;
                virsum  = 0;
                integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
                eners[i] -= enersum;
                virs[i]  -= virsum;
            }

            /* Alright: Above we compensated by REMOVING the parts outside r0
             * corresponding to the ideal VdW 1/r^6 and 1/r^12 potentials.
             *
             * Regardless of whether r0 is the point where we start switching,
             * or the cutoff where we calculated the constant shift, we include
             * all the parts we are missing out to infinity from r0 by
             * calculating the analytical dispersion correction.
             */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
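            /* The four tail terms above follow from elementary integrals
             * (added note, not in the original source), per unit c6/c12:
             *   energy: int_r0^inf 4*pi*r^2*(-1/r^6)  dr = -4*pi/(3*r0^3)
             *           int_r0^inf 4*pi*r^2*( 1/r^12) dr =  4*pi/(9*r0^9)
             *   virial: int_r0^inf 4*pi*r^2 * r*dV/dr dr, which for
             *           V = -1/r^6 gives 8*pi/r0^3 and for V = 1/r^12
             *           gives -16*pi/(3*r0^9).
             */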
        }
        else if (ic->vdwtype == evdwCUT ||
                 EVDW_PME(ic->vdwtype) ||
                 ic->vdwtype == evdwUSER)
        {
            if (ic->vdwtype == evdwUSER && fplog)
            {
                fprintf(fplog,
                        "WARNING: using dispersion correction with user tables\n");
            }

            /* Note that with LJ-PME, the dispersion correction is multiplied
             * by the difference between the actual C6 and the value of C6
             * that would produce the combination rule.
             * This means the normal energy and virial difference formulas
             * can be used here.
             */

            rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
            rc9 = rc3*rc3*rc3;

            /* Contribution beyond the cut-off */
            eners[0] += -4.0*M_PI/(3.0*rc3);
            eners[1] +=  4.0*M_PI/(9.0*rc9);
            if (ic->vdw_modifier == eintmodPOTSHIFT)
            {
                /* Contribution within the cut-off */
                eners[0] += -4.0*M_PI/(3.0*rc3);
                eners[1] +=  4.0*M_PI/(3.0*rc9);
            }
            /* Contribution beyond the cut-off */
            virs[0]  +=  8.0*M_PI/rc3;
            virs[1]  += -16.0*M_PI/(3.0*rc9);
        }
        else
        {
            gmx_fatal(FARGS,
                      "Dispersion correction is not implemented for vdw-type = %s",
                      evdw_names[ic->vdwtype]);
        }

        /* When we deprecate the group kernels the code below can go too */
        if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
        {
            /* Calculate self-interaction coefficient (assuming that
             * the reciprocal-space contribution is constant in the
             * region that contributes to the self-interaction).
             */
            fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;

            eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
        }

        fr->enerdiffsix    = eners[0];
        fr->enerdifftwelve = eners[1];
        /* The 0.5 is due to the GROMACS definition of the virial,
         * Xi = -1/2 sum_i r_i (x) F_i. */
        fr->virdiffsix     = 0.5*virs[0];
        fr->virdifftwelve  = 0.5*virs[1];
    }
}
void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
                   matrix box, real lambda, tensor pres, tensor virial,
                   real *prescorr, real *enercorr, real *dvdlcorr)
{
    gmx_bool bCorrAll, bCorrPres;
    real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
    int      m;

    *prescorr = 0;
    *enercorr = 0;
    *dvdlcorr = 0;

    clear_mat(virial);
    clear_mat(pres);

    if (ir->eDispCorr != edispcNO)
    {
        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
                     ir->eDispCorr == edispcAllEnerPres);
        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
                     ir->eDispCorr == edispcAllEnerPres);

        invvol = 1/det(box);
        if (fr->n_tpi)
        {
            /* Only correct for the interactions with the inserted molecule */
            dens   = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
            ninter = fr->n_tpi;
        }
        else
        {
            dens   = fr->numAtomsForDispersionCorrection*invvol;
            ninter = 0.5*fr->numAtomsForDispersionCorrection;
        }

        if (ir->efep == efepNO)
        {
            avcsix    = fr->avcsix[0];
            avctwelve = fr->avctwelve[0];
        }
        else
        {
            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
        }

        enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
        *enercorr += avcsix*enerdiff;
        dvdlambda  = 0.0;
        if (ir->efep != efepNO)
        {
            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
        }
        if (bCorrAll)
        {
            enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
            *enercorr += avctwelve*enerdiff;
            if (fr->efep != efepNO)
            {
                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
            }
        }

        if (bCorrPres)
        {
            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
            if (ir->eDispCorr == edispcAllEnerPres)
            {
                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
            }
            /* The factor 2 is because of the Gromacs virial definition */
            spres = -2.0*invvol*svir*PRESFAC;
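            /* Unit bookkeeping for the line above (added note, not in the
             * original source): with the virial defined as
             * Xi = -1/2 sum r (x) F, the pressure is P = 2/(3V)*(Ekin - Xi),
             * so adding svir to each diagonal element of Xi shifts that
             * pressure component by -2*svir/V, converted from
             * kJ mol^-1 nm^-3 to bar by PRESFAC.
             */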
            for (m = 0; m < DIM; m++)
            {
                virial[m][m] += svir;
                pres[m][m]   += spres;
            }
            *prescorr += spres;
        }

        /* Can't currently control when it prints; for now, just print when debugging */
        if (debug)
        {
            if (bCorrAll)
            {
                fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
                        avcsix, avctwelve);
            }
            if (bCorrPres)
            {
                fprintf(debug,
                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
                        *enercorr, spres, svir);
            }
            else
            {
                fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
            }
        }

        if (fr->efep != efepNO)
        {
            *dvdlcorr += dvdlambda;
        }
    }
}
static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                            const gmx_mtop_t *mtop, rvec x[],
                            gmx_bool bFirst)
{
    t_graph *graph;
    int      as, mol;

    if (bFirst && fplog)
    {
        fprintf(fplog, "Removing pbc first time\n");
    }

    snew(graph, 1);
    as = 0;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t &moltype = mtop->moltype[molb.type];
        if (moltype.atoms.nr == 1 ||
            (!bFirst && moltype.cgs.nr == 1))
        {
            /* Just one atom or charge group in the molecule, no PBC required */
            as += molb.nmol*moltype.atoms.nr;
        }
        else
        {
            /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
            mk_graph_ilist(nullptr, moltype.ilist,
                           0, moltype.atoms.nr, FALSE, FALSE, graph);

            for (mol = 0; mol < molb.nmol; mol++)
            {
                mk_mshift(fplog, graph, ePBC, box, x+as);

                shift_self(graph, box, x+as);
                /* The molecule is whole now.
                 * We don't need the second mk_mshift call as in do_pbc_first,
                 * since we no longer need this graph.
                 */

                as += moltype.atoms.nr;
            }
            done_graph(graph);
        }
    }
    sfree(graph);
}

void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
                       const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
}

void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                 const gmx_mtop_t *mtop, rvec x[])
{
    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
}
void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
{
    int t, nth;
    nth = gmx_omp_nthreads_get(emntDefault);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (t = 0; t < nth; t++)
    {
        try
        {
            size_t natoms = x.size();
            size_t offset = (natoms*t)/nth;
            size_t len    = (natoms*(t + 1))/nth - offset;
            put_atoms_in_box(ePBC, box, x.subArray(offset, len));
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
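/* Worked example of the chunking above (added note, not in the original
 * source): natoms = 10 and nth = 4 give offsets {0, 2, 5, 7} and lengths
 * {2, 3, 2, 3}; each atom lands in exactly one chunk because thread t
 * ends at (natoms*(t+1))/nth, which is exactly where thread t+1 begins.
 */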
// TODO This can be cleaned up a lot, and move back to runner.cpp
void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
                const t_inputrec *inputrec,
                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                gmx_walltime_accounting_t walltime_accounting,
                nonbonded_verlet_t *nbv,
                const gmx_pme_t *pme,
                gmx_bool bWriteStat)
{
    t_nrnb *nrnb_tot = nullptr;
    double  delta_t  = 0;
    double  nbfs     = 0, mflop = 0;
    double  elapsed_time,
            elapsed_time_over_all_ranks,
            elapsed_time_over_all_threads,
            elapsed_time_over_all_threads_over_all_ranks;
    /* Control whether it is valid to print a report. Only the
       simulation master may print, but it should not do so if the run
       terminated e.g. before a scheduled reset step. This is
       complicated by the fact that PME ranks are unaware of the
       reason why they were sent a pmerecvqxFINISH. To avoid
       communication deadlocks, we always do the communication for the
       report, even if we've decided not to write the report, because
       how long it takes to finish the run is not important when we've
       decided not to report on the simulation performance.

       Further, we only report performance for dynamical integrators,
       because those are the only ones for which we plan to
       consider doing any optimizations. */
    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);

    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
    {
        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
        printReport = false;
    }

    if (cr->nnodes > 1)
    {
        snew(nrnb_tot, 1);
#if GMX_MPI
        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
#endif
    }
    else
    {
        nrnb_tot = nrnb;
    }

    elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
    elapsed_time_over_all_ranks                  = elapsed_time;
    elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
    elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
#if GMX_MPI
    if (cr->nnodes > 1)
    {
        /* reduce elapsed_time over all MPI ranks in the current simulation */
        MPI_Allreduce(&elapsed_time,
                      &elapsed_time_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
        elapsed_time_over_all_ranks /= cr->nnodes;
        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
         * current simulation. */
        MPI_Allreduce(&elapsed_time_over_all_threads,
                      &elapsed_time_over_all_threads_over_all_ranks,
                      1, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);
    }
#endif
    if (printReport)
    {
        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
    }
    if (cr->nnodes > 1)
    {
        sfree(nrnb_tot);
    }

    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
    {
        print_dd_statistics(cr, inputrec, fplog);
    }

    /* TODO Move the responsibility for any scaling by thread counts
     * to the code that handled the thread region, so that there's a
     * mechanism to keep cycle counting working during the transition
     * to task parallelism. */
    int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    int nthreads_pme = gmx_omp_nthreads_get(emntPME);
    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
    auto cycle_sum(wallcycle_sum(cr, wcycle));

    if (printReport)
    {
        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
        gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
        if (pme_gpu_task_enabled(pme))
        {
            pme_gpu_get_timings(pme, &pme_gpu_timings);
        }
        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                        elapsed_time_over_all_ranks,
                        wcycle, cycle_sum,
                        nbnxn_gpu_timings,
                        &pme_gpu_timings);

        if (EI_DYNAMICS(inputrec->eI))
        {
            delta_t = inputrec->delta_t;
        }

        if (fplog)
        {
            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
        if (bWriteStat)
        {
            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
                       elapsed_time_over_all_ranks,
                       walltime_accounting_get_nsteps_done(walltime_accounting),
                       delta_t, nbfs, mflop);
        }
    }
}
extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
{
    /* this function works, but could probably use a logic rewrite to keep all the different
       types of efep straight. */

    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
    {
        return;
    }

    t_lambda *fep = ir->fepvals;
    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
                                            if checkpoint is set -- a kludge is in for now
                                            to prevent this. */

    for (int i = 0; i < efptNR; i++)
    {
        /* overwrite lambda state with init_lambda for now for backwards compatibility */
        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
        {
            lambda[i] = fep->init_lambda;
            if (lam0)
            {
                lam0[i] = lambda[i];
            }
        }
        else
        {
            lambda[i] = fep->all_lambda[i][*fep_state];
            if (lam0)
            {
                lam0[i] = lambda[i];
            }
        }
    }
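    /* Concrete example of the loop above (added note, not in the original
     * source): with init_lambda = -1 (i.e. per-component lambda vectors in
     * use), init_fep_state = 2 and all_lambda[efptCOUL] = {0.0, 0.5, 1.0},
     * the loop sets lambda[efptCOUL] = 1.0; with init_lambda = 0.5 every
     * component is set to 0.5 instead.
     */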
    if (ir->bSimTemp)
    {
        /* need to rescale control temperatures to match current state */
        for (int i = 0; i < ir->opts.ngtc; i++)
        {
            if (ir->opts.ref_t[i] > 0)
            {
                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
            }
        }
    }

    /* Send to the log the information on the current lambdas */
    if (fplog != nullptr)
    {
        fprintf(fplog, "Initial vector of lambda components:[ ");
        for (const auto &l : lambda)
        {
            fprintf(fplog, "%10.4f ", l);
        }
        fprintf(fplog, "]\n");
    }
}
void init_md(FILE *fplog,
             const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
             t_inputrec *ir, const gmx_output_env_t *oenv,
             const MdrunOptions &mdrunOptions,
             double *t, double *t0,
             t_state *globalState, double *lam0,
             t_nrnb *nrnb, gmx_mtop_t *mtop,
             gmx_update_t **upd,
             gmx::BoxDeformation *deform,
             int nfile, const t_filenm fnm[],
             gmx_mdoutf_t *outf, t_mdebin **mdebin,
             tensor force_vir, tensor shake_vir, rvec mu_tot,
             gmx_bool *bSimAnn, t_vcm **vcm,
             gmx_wallcycle_t wcycle)
{
    int i;

    /* Initial values */
    *t = *t0 = ir->init_t;

    *bSimAnn = FALSE;
    for (i = 0; i < ir->opts.ngtc; i++)
    {
        /* set bSimAnn if any group is being annealed */
        if (ir->opts.annealing[i] != eannNO)
        {
            *bSimAnn = TRUE;
        }
    }

    /* Initialize lambda variables */
    /* TODO: Clean up initialization of fep_state and lambda in t_state.
     * We currently need to call initialize_lambdas on non-master ranks
     * to initialize lam0.
     */
    if (MASTER(cr))
    {
        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
    }
    else
    {
        int                      tmpFepState;
        std::array<real, efptNR> tmpLambda;
        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
    }
    // TODO upd is never NULL in practice, but the analysers don't know that
    if (upd)
    {
        *upd = init_update(ir, deform);
    }
    if (*bSimAnn)
    {
        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
    }

    if (vcm != nullptr)
    {
        *vcm = init_vcm(fplog, &mtop->groups, ir);
    }

    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
    {
        if (ir->etc == etcBERENDSEN)
        {
            please_cite(fplog, "Berendsen84a");
        }
        if (ir->etc == etcVRESCALE)
        {
            please_cite(fplog, "Bussi2007a");
        }
        if (ir->eI == eiSD1)
        {
            please_cite(fplog, "Goga2012");
        }
    }
    init_nrnb(nrnb);

    *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);

    *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
                          mtop, ir, mdoutf_get_fp_dhdl(*outf));

    /* Initiate variables */
    clear_mat(force_vir);
    clear_mat(shake_vir);
    clear_rvec(mu_tot);
}