/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "gromacs/listed-forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/sysinfo.h"

#include "nbnxn_gpu.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
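/* Usage sketch (assumption: standard mdrun environment-variable handling,
 * not code from this file): set the variable before launching mdrun to fall
 * back to the static wait order, e.g.
 *   GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun -deffnm topol
 */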
void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                int64_t                   step,
                const t_inputrec         *ir,
                const t_commrec          *cr)
{
    time_t finish;
    double dt = 0, elapsed_seconds = 0, time_per_step = 0;
    char   buf[STRLEN];
    char   timebuf[STRLEN];

    fprintf(out, "step %s", gmx_step_str(step, buf));
    fflush(out);

    if ((step >= ir->nstlist))
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = static_cast<time_t>(seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", static_cast<int>(dt));
            }
        }
        else
        {
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }

    fflush(out);

    GMX_UNUSED_VALUE(cr);
}
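/* A worked example of the ns/day arithmetic above (illustrative numbers
 * only): delta_t is in ps, so delta_t/1000 is the ns advanced per step;
 * multiplying by 24*60*60 s/day and dividing by the measured seconds per
 * step gives ns/day. With delta_t = 0.002 ps and 0.01 s/step:
 *   0.002/1000 * 86400 / 0.01 ~= 17.3 ns/day.
 */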
void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         double the_time)
{
    if (!fplog)
    {
        return;
    }

    char   time_string[STRLEN];
    char   timebuf[STRLEN];
    time_t temp_time = static_cast<time_t>(the_time);

    gmx_ctime_r(&temp_time, timebuf, STRLEN);
    int i;
    for (i = 0; timebuf[i] >= ' '; i++)
    {
        time_string[i] = timebuf[i];
    }
    time_string[i] = '\0';

    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
}
void print_start(FILE *fplog, const t_commrec *cr,
                 gmx_walltime_accounting_t walltime_accounting,
                 const char *name)
{
    char buf[STRLEN];

    sprintf(buf, "Started %s", name);
    print_date_and_time(fplog, cr->nodeid, buf,
                        walltime_accounting_get_start_time_stamp(walltime_accounting));
}
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
    const int      end = forceToAdd.size();

    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
static void pme_gpu_reduce_outputs(gmx_wallcycle_t                 wcycle,
                                   gmx::ForceWithVirial           *forceWithVirial,
                                   gmx::ArrayRef<const gmx::RVec>  pmeForces,
                                   gmx_enerdata_t                 *enerd,
                                   const matrix                    vir_Q,
                                   real                            Vlr_q)
{
    wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
    GMX_ASSERT(forceWithVirial, "Invalid force pointer");
    forceWithVirial->addVirialContribution(vir_Q);
    enerd->term[F_COUL_RECIP] += Vlr_q;
    sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
    wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
}
static void calc_virial(int start, int homenr, const rvec x[], const rvec f[],
                        tensor vir_part, const t_graph *graph, const matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    /* The short-range virial from surrounding boxes */
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate the partial virial, for local atoms only, based on short range.
     * The total virial is computed in global_stat, called from do_md.
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
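/* For reference, the quantity accumulated above is the standard virial
 *   Xi = -1/2 * sum_i x_i (x) f_i   (outer product of position and force),
 * split into a shift-vector part for periodic images and a local-atom part;
 * see the GROMACS reference manual for the full derivation.
 */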
static void pull_potential_wrapper(const t_commrec *cr,
                                   const t_inputrec *ir,
                                   const matrix box, gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial *force,
                                   const t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   const real *lambda,
                                   double t,
                                   gmx_wallcycle_t wcycle)
{
    t_pbc pbc;
    real  dvdl = 0;

    /* Calculate the center of mass forces; this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->ePBC, box);
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->pull_work, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(const t_commrec      *cr,
                                   gmx::ForceWithVirial *forceWithVirial,
                                   gmx_enerdata_t       *enerd,
                                   gmx_wallcycle_t       wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl_q  = 0;
    dvdl_lj = 0;
    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP]   += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW]  += dvdl_lj;

    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE            *fp,
                               const t_mdatoms *md,
                               const t_commrec *cr,
                               int64_t          step,
                               real             forceTolerance,
                               const rvec      *x,
                               const rvec      *f)
{
    real       force2Tolerance = gmx::square(forceTolerance);
    gmx::index numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
    }
}
static void post_process_forces(const t_commrec      *cr,
                                int64_t               step,
                                t_nrnb               *nrnb,
                                gmx_wallcycle_t       wcycle,
                                const gmx_localtop_t *top,
                                const matrix          box,
                                const rvec            x[],
                                rvec                  f[],
                                gmx::ForceWithVirial *forceWithVirial,
                                tensor                vir_force,
                                const t_mdatoms      *mdatoms,
                                const t_graph        *graph,
                                const t_forcerec     *fr,
                                const gmx_vsite_t    *vsite,
                                int                   flags)
{
    if (fr->haveDirectVirialContributions)
    {
        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            matrix virial = { { 0 } };
            spread_vsite_f(vsite, x, fDirectVir, nullptr,
                           (flags & GMX_FORCE_VIRIAL) != 0, virial,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
            forceWithVirial->addVirialContribution(virial);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces; this is local */
            sum_forces(f, forceWithVirial->force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial->getVirial(), vir_force);
        }
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(const t_forcerec          *fr,
                         const interaction_const_t *ic,
                         gmx_enerdata_t            *enerd,
                         int                        flags,
                         int                        ilocality,
                         int                        clearF,
                         int64_t                    step,
                         t_nrnb                    *nrnb,
                         gmx_wallcycle_t            wcycle)
{
    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t       *nbv  = fr->nbv;
    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);

    if (!bUsingGpuKernels)
    {
        /* When dynamic pair-list pruning is requested, we need to prune
         * at nstlistPrune steps.
         */
        if (nbv->listParams->useDynamicPruning &&
            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
        {
            /* Prune the pair-list beyond fr->ic->rlistPrune using
             * the current coordinates of the atoms.
             */
            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }

    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_cpu(nbvg,
                             nbv->nbat,
                             ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift,
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_GPU:
            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbv->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbv->nbat->out[0].f,
                                 fr->fshift,
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
    }
    if (!bUsingGpuKernels)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);

    if (ic->vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
    if (ic->vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
    }
}
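/* Illustration of the CPU-side dynamic pruning test above (example values
 * assumed, not from this file): with nstlist = 40 and nstlistPrune = 10,
 * a list created at step s satisfies the modulo test at offsets 0, 10, 20
 * and 30 into its lifetime, so the inner list is re-pruned against the
 * current coordinates every 10 steps until the next search step rebuilds it.
 */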
static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
                             t_forcerec           *fr,
                             rvec                  x[],
                             rvec                  f[],
                             const t_mdatoms      *mdatoms,
                             t_lambda             *fepvals,
                             real                 *lambda,
                             gmx_enerdata_t       *enerd,
                             int                   flags,
                             t_nrnb               *nrnb,
                             gmx_wallcycle_t       wcycle)
{
    int              donb_flags;
    nb_kernel_data_t kernel_data;
    real             lam_i[efptNR];
    real             dvdl_nb[efptNR];
    int              th;
    int              i, j;

    donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (flags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (flags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    kernel_data.flags          = donb_flags;
    kernel_data.lambda         = lambda;
    kernel_data.dvdl           = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i] = 0;
    }

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle, ewcsNONBONDED);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
    for (th = 0; th < nbl_lists->nnbl; th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                      x, f, fr, mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    /* If we do foreign lambda and we have soft-core interactions,
     * we have to recalculate the (non-linear) energy contributions.
     */
    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (i = 0; i < enerd->n_lambda; i++)
        {
            for (j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
            for (th = 0; th < nbl_lists->nnbl; th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
                                              x, f, fr, mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }

    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
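/* Sketch of how the foreign-lambda loop above assembles its lambda vector
 * (restating the code with an assumed example): for foreign point i > 0,
 * lam_i[j] = fepvals->all_lambda[j][i-1], while i == 0 re-evaluates at the
 * current lambda[j]; each pass accumulates the soft-core energies into
 * enerd->enerpart_lambda[i] for later free-energy (e.g. BAR) output.
 */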
gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
{
    return nbv != nullptr && nbv->bUseGPU;
}
static inline void clear_rvecs_omp(int n, rvec v[])
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            clear_rvec(v[i]);
        }
    }
}
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled   += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
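/* Worked example (illustrative numbers only): by equipartition each coupled
 * degree of freedom contributes kB*T/2, and BOLTZ ~= 0.008314 kJ/(mol K) in
 * GROMACS units, so 3000 coupled degrees of freedom at ref_t = 300 K give
 *   0.5 * 3000 * 300 * 0.008314 ~= 3.7e3 kJ/mol.
 */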
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step it also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t               step,
                                         const gmx_enerdata_t &enerd,
                                         const t_inputrec     &inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real           averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow us to catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
    {
        gmx_fatal(FARGS, "Step %" PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly- or non-equilibrated initial configuration, incorrect interactions or parameters in the topology.",
                  step,
                  enerd.term[F_EPOT],
                  energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ],
                  enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high",
                  energyIsNotFinite ? " or NaN" : "");
    }
}
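/* Continuing the example above: with an average kinetic energy estimate of
 * ~3.7e3 kJ/mol, the c_thresholdFactor of 1e6 means the initial potential
 * energy only triggers the fatal error beyond ~3.7e9 kJ/mol, so false
 * positives are effectively impossible for sane systems.
 */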
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog            The log file
 * \param[in]     cr               The communication record
 * \param[in]     inputrec         The input record
 * \param[in]     awh              The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation Enforced rotation module.
 * \param[in]     step             The current MD step
 * \param[in]     t                The current time
 * \param[in,out] wcycle           Wallcycle accounting struct
 * \param[in,out] forceProviders   Pointer to a list of force providers
 * \param[in]     box              The unit cell
 * \param[in]     x                The coordinates
 * \param[in]     mdatoms          Per atom properties
 * \param[in]     lambda           Array of free-energy lambda values
 * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] enerd            Energy buffer
 * \param[in,out] ed               Essential dynamics pointer
 * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove bNS, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void
computeSpecialForces(FILE                          *fplog,
                     const t_commrec               *cr,
                     const t_inputrec              *inputrec,
                     gmx::Awh                      *awh,
                     gmx_enfrot                    *enforcedRotation,
                     int64_t                        step,
                     double                         t,
                     gmx_wallcycle_t                wcycle,
                     ForceProviders                *forceProviders,
                     matrix                         box,
                     gmx::ArrayRef<const gmx::RVec> x,
                     const t_mdatoms               *mdatoms,
                     real                          *lambda,
                     int                            forceFlags,
                     gmx::ForceWithVirial          *forceWithVirial,
                     gmx_enerdata_t                *enerd,
                     gmx_edsam                     *ed,
                     gmx_bool                       bNS)
{
    const bool computeForces = (forceFlags & GMX_FORCE_FORCES) != 0;

    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (computeForces)
    {
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
    }

    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x,
                               forceWithVirial,
                               mdatoms, enerd, lambda, t,
                               wcycle);
    }
    if (awh)
    {
        enerd->term[F_COM_PULL] +=
            awh->applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
                                              forceWithVirial,
                                              t, step, wcycle, fplog);
    }

    rvec *f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, bNS);
    }

    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
    if (inputrec->bIMD && computeForces)
    {
        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
    }
}
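/* A minimal sketch of a ForceProvider, assuming the gmx::IForceProvider
 * interface from gromacs/mdtypes/iforceprovider.h (the class name and the
 * body are illustrative, not part of this file):
 *
 *   class MyConstantForce : public gmx::IForceProvider
 *   {
 *       public:
 *           void calculateForces(const gmx::ForceProviderInput &input,
 *                                gmx::ForceProviderOutput      *output) override
 *           {
 *               // Add a force to atom 0; a real provider would loop over
 *               // the input coordinates and may also add energy terms.
 *               output->forceWithVirial_.force_[0][XX] += 1.0;
 *           }
 *   };
 *
 * Registering such an object with fr->forceProviders makes it part of the
 * calculateForces() call above.
 */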
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  box      The box matrix
 * \param[in]  x        Coordinate array
 * \param[in]  flags    Force flags
 * \param[in]  wcycle   The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
                                      matrix          box,
                                      rvec            x[],
                                      int             flags,
                                      gmx_wallcycle_t wcycle)
{
    int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
    pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
    pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;

    pme_gpu_prepare_computation(pmedata, (flags & GMX_FORCE_DYNAMICBOX) != 0, box, wcycle, pmeFlags);
    pme_gpu_launch_spread(pmedata, x, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in]  pmedata  The PME structure
 * \param[in]  wcycle   The wallcycle structure
 */
static void launchPmeGpuFftAndGather(gmx_pme_t      *pmedata,
                                     gmx_wallcycle_t wcycle)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle);
    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
/*! \brief
 *  Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv              Nonbonded verlet structure
 * \param[in]     pmedata          PME module data
 * \param[in,out] force            Force array to reduce task outputs into.
 * \param[in,out] forceWithVirial  Force and virial buffers
 * \param[in,out] fshift           Shift force output vector results are reduced into
 * \param[in,out] enerd            Energy data structure results are reduced into
 * \param[in]     flags            Force flags
 * \param[in]     wcycle           The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
                                        const gmx_pme_t                *pmedata,
                                        gmx::PaddedArrayRef<gmx::RVec> *force,
                                        gmx::ForceWithVirial           *forceWithVirial,
                                        rvec                            fshift[],
                                        gmx_enerdata_t                 *enerd,
                                        int                             flags,
                                        gmx_wallcycle_t                 wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            matrix vir_Q;
            real   Vlr_q;

            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
                                                   vir_Q, &Vlr_q, completionType);

            if (isPmeGpuDone)
            {
                pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
                                       enerd, vir_Q, Vlr_q);
            }
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
                                                    flags, eatLocal,
                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                                    fshift, completionType);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
            // To get the call count right, when the task finished we
            // issue a start/stop.
            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
            if (isNbGpuDone)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
                                               nbv->nbat, as_rvec_array(force->data()), wcycle);
            }
        }
    }
}
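/* The polling pattern above in miniature (illustration only, not GROMACS
 * API):
 *
 *   while (!doneA || !doneB)
 *   {
 *       if (!doneA) { doneA = tryFinish(A, doneB ? Wait : Check); if (doneA) { reduceA(); } }
 *       if (!doneB) { doneB = tryFinish(B, doneA ? Wait : Check); if (doneB) { reduceB(); } }
 *   }
 *
 * Whichever task completes first gets its reduction overlapped with the
 * other task's remaining GPU work; once one task is done, the other is
 * waited for with a blocking call instead of further polling.
 */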
/*! \brief
 * Launch the dynamic rolling pruning GPU task.
 *
 * We currently alternate local/non-local list pruning in odd-even steps
 * (only pruning every second step without DD).
 *
 * \param[in]  cr        The communication record
 * \param[in]  nbv       Nonbonded verlet structure
 * \param[in]  inputrec  The input record
 * \param[in]  step      The current MD step
 */
static inline void launchGpuRollingPruning(const t_commrec          *cr,
                                           const nonbonded_verlet_t *nbv,
                                           const t_inputrec         *inputrec,
                                           const int64_t             step)
{
    /* We should not launch the rolling pruning kernel at a search
     * step or just before search steps, since that's useless.
     * Without domain decomposition we prune at even steps.
     * With domain decomposition we alternate local and non-local
     * pruning at even and odd steps.
     */
    int  numRollingParts     = nbv->listParams->numRollingParts;
    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
    if (stepWithCurrentList > 0 &&
        stepWithCurrentList < inputrec->nstlist - 1 &&
        (stepIsEven || DOMAINDECOMP(cr)))
    {
        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
                                          stepIsEven ? eintLocal : eintNonlocal,
                                          numRollingParts);
    }
}
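/* Worked example (assumed values, not from this file): with
 * nstlistPrune = 10 the assertion requires numRollingParts = 5; with domain
 * decomposition, a list built at step s is then prune-rolled for the local
 * interactions at offsets 2, 4, ... (even) and for the non-local ones at
 * offsets 1, 3, ... (odd), skipping offset 0 and offsets at or beyond
 * nstlist - 1 around search steps.
 */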
static void do_force_cutsVERLET(FILE *fplog,
                                const t_commrec *cr,
                                const gmx_multisim_t *ms,
                                const t_inputrec *inputrec,
                                gmx::Awh *awh,
                                gmx_enfrot *enforcedRotation,
                                int64_t step,
                                t_nrnb *nrnb,
                                gmx_wallcycle_t wcycle,
                                const gmx_localtop_t *top,
                                const gmx_groups_t * /* groups */,
                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                                history_t *hist,
                                gmx::PaddedArrayRef<gmx::RVec> force,
                                tensor vir_force,
                                const t_mdatoms *mdatoms,
                                gmx_enerdata_t *enerd, t_fcdata *fcd,
                                real *lambda,
                                t_graph *graph,
                                t_forcerec *fr,
                                interaction_const_t *ic,
                                const gmx_vsite_t *vsite,
                                rvec mu_tot,
                                double t,
                                gmx_edsam *ed,
                                int flags,
                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int                 cg1, i, j;
    double              mu[2*DIM];
    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
    rvec                vzero, box_diag;
    float               cycles_pme, cycles_wait_gpu;
    nonbonded_verlet_t *nbv = fr->nbv;

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);

    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (bNS &&
        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
    {
        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
    }

    cycles_wait_gpu = 0;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }
    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.data()));
        }
    }

    nbnxn_atomdata_copy_shiftvec((flags & GMX_FORCE_DYNAMICBOX) != 0,
                                 fr->shift_vec, nbv->nbat);

#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */

    if (useGpuPme)
    {
        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
    }
    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!DOMAINDECOMP(cr))
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs.get(), fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
                              0, nullptr,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                       fr->cginfo, as_rvec_array(x.data()),
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        nbnxn_atomdata_set(nbv->nbat, nbv->nbs.get(), mdatoms, fr->cginfo);
        wallcycle_stop(wcycle, ewcNS);
    }
    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);

        if (bNS)
        {
            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
        }

        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);

        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                            &top->excls,
                            nbv->listParams->rlistOuter,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
        if (nbv->listParams->useDynamicPruning && !bUseGPU)
        {
            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
        }
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
                                    eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.data()),
                                        nbv->nbat, wcycle);
    }

    if (bUseGPU)
    {
        if (DOMAINDECOMP(cr))
        {
            ddOpenBalanceRegionGpu(cr->dd);
        }

        wallcycle_start(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        /* launch local nonbonded work on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     step, nrnb, wcycle);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    if (useGpuPme)
    {
        // In PME GPU and mixed mode we launch FFT / gather after the
        // X copy/transform to allow overlap as well as after the GPU NB
        // launch to avoid FFT launch overhead hijacking the CPU and delaying
        // the nonbonded kernel.
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
    }
    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                                &top->excls,
                                nbv->listParams->rlistOuter,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);
            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
            if (nbv->listParams->useDynamicPruning && !bUseGPU)
            {
                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
            }
            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                        eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            dd_move_x(cr->dd, box, x, wcycle);

            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.data()),
                                            nbv->nbat, wcycle);
        }

        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU);
            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            /* launch non-local nonbonded tasks on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        if (DOMAINDECOMP(cr))
        {
            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                     flags, eatNonlocal);
        }
        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                 flags, eatLocal);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    if (bStateChanged && inputrecNeedMutot(inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
            ddReopenBalanceRegionCpu(cr->dd);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);
    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *const f = as_rvec_array(force.data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    gmx::ArrayRef<gmx::RVec> forceRef = force;
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * directly, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            /* TODO: update comment
             * We only compute forces on local atoms. Note that vsites can
             * spread to non-local atoms, but that part of the buffer is
             * cleared separately in the vsite spreading code.
             */
            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }

        /* Clear the short- and long-range forces */
        clear_rvecs_omp(fr->natoms_force_constr, f);
    }

    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }
    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, because in that
     * function, the listed forces are calculated before PME, which
     * does communication. With this order, non-bonded and listed
     * force calculation imbalance can be balanced out by the domain
     * decomposition load balancing.
     */
    if (!bUseOrEmulGPU)
    {
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     step, nrnb, wcycle);
    }

    if (fr->efep != efepNO)
    {
        /* Calculate the local and non-local free energy interactions here.
         * Happens here on the CPU both with and without GPU.
         */
        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
                             fr, as_rvec_array(x.data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }

        if (DOMAINDECOMP(cr) &&
            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
        {
            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
                             fr, as_rvec_array(x.data()), f, mdatoms,
                             inputrec->fepvals, lambda,
                             enerd, flags, nrnb, wcycle);
        }
    }

    if (!bUseOrEmulGPU)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         step, nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded force to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        wallcycle_stop(wcycle, ewcFORCE);

        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);

        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            /* This is not in a subcounter because it takes a
               negligible and constant-sized amount of time */
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);
    wallcycle_stop(wcycle, ewcFORCE);

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         step, t, wcycle,
                         fr->forceProviders, box, x, mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
                         ed, bNS);
    if (bUseOrEmulGPU)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                           flags, eatNonlocal,
                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                           fr->fshift);
                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }

            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
                                               nbv->nbat, f, wcycle);
            }
        }
    }

    if (DOMAINDECOMP(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
        if (bDoForces)
        {
            dd_move_f(cr->dd, force, fr->fshift, wcycle);
        }
    }
    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
    }

    if (!alternateGpuWait && useGpuPme)
    {
        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
        matrix vir_Q;
        real   Vlr_q = 0.0;
        pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
    }
    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && bUseGPU)
    {
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         * of the step time.
         */
        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */

        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                   flags, eatLocal,
                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                   fr->fshift);
        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
            {
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            }
            ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
        }
    }

    if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
    {
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                     step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);
    }
    if (useGpuPme)
    {
        pme_gpu_reinit_computation(fr->pmedata, wcycle);
    }

    if (bUseGPU)
    {
        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);

        /* Is dynamic pair-list pruning activated? */
        if (nbv->listParams->useDynamicPruning)
        {
            launchGpuRollingPruning(cr, nbv, inputrec, step);
        }
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }
    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (bUseOrEmulGPU && !alternateGpuWait)
    {
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
                                       nbv->nbat, f, wcycle);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }

    if (bDoForces)
    {
        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
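/* Summary of the scheduling above for a GPU-offloaded Verlet step (derived
 * from the code in do_force_cutsVERLET, simplified): launch PME spread ->
 * launch local NB kernel -> launch PME FFT+gather -> communicate x and
 * launch non-local NB -> launch D2H force copy-back -> run CPU bonded and
 * free-energy work -> wait for non-local NB, dd_move_f -> wait for PME and
 * local NB (alternating wait when both run on one rank) -> reduce force
 * buffers and post-process.
 */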
static void do_force_cutsGROUP(FILE *fplog,
                               const t_commrec *cr,
                               const gmx_multisim_t *ms,
                               const t_inputrec *inputrec,
                               gmx::Awh *awh,
                               gmx_enfrot *enforcedRotation,
                               int64_t step,
                               t_nrnb *nrnb,
                               gmx_wallcycle_t wcycle,
                               gmx_localtop_t *top,
                               const gmx_groups_t *groups,
                               matrix box, gmx::PaddedArrayRef<gmx::RVec> x,
                               history_t *hist,
                               gmx::PaddedArrayRef<gmx::RVec> force,
                               tensor vir_force,
                               const t_mdatoms *mdatoms,
                               gmx_enerdata_t *enerd,
                               t_fcdata *fcd,
                               real *lambda,
                               t_graph *graph,
                               t_forcerec *fr,
                               const gmx_vsite_t *vsite,
                               rvec mu_tot,
                               double t,
                               gmx_edsam *ed,
                               int flags,
                               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
                               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
{
    int      cg0, cg1, i, j;
    double   mu[2*DIM];
    gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
    gmx_bool bDoForces;
    float    cycles_pme;

    const int start  = 0;
    const int homenr = mdatoms->homenr;

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->globalAtomGroupIndices.size();
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
    bNS           = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
    /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (inputrecNeedMutot(inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
                                     &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
            inc_nrnb(nrnb, eNR_CGCM, homenr);
            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, as_rvec_array(x.data()));
        }
    }
    else if (bCalcCGCM)
    {
        calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
        inc_nrnb(nrnb, eNR_CGCM, homenr);
    }

    if (bCalcCGCM && gmx_debug_at)
    {
        pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
    }

#if GMX_MPI
    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */
        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
                                 lambda[efptCOUL], lambda[efptVDW],
                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) != 0,
                                 step, wcycle);
    }
#endif /* GMX_MPI */

    /* Communicate coordinates and sum dipole if necessary */
    if (DOMAINDECOMP(cr))
    {
        dd_move_x(cr->dd, box, x, wcycle);

        /* No GPU support, no move_x overlap, so reopen the balance region here */
        if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
        {
            ddReopenBalanceRegionCpu(cr->dd);
        }
    }

    if (inputrecNeedMutot(inputrec))
    {
        if (bStateChanged)
        {
            if (PAR(cr))
            {
                gmx_sumd(2*DIM, mu, cr);
                ddReopenBalanceRegionCpu(cr->dd);
            }
            for (i = 0; i < 2; i++)
            {
                for (j = 0; j < DIM; j++)
                {
                    fr->mu_tot[i][j] = mu[i*DIM + j];
                }
            }
        }
        if (fr->efep == efepNO)
        {
            copy_rvec(fr->mu_tot[0], mu_tot);
        }
        else
        {
            for (j = 0; j < DIM; j++)
            {
                mu_tot[j] =
                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
            }
        }
    }

    /* Reset energies */
    reset_enerdata(enerd);
    clear_rvecs(SHIFTS, fr->fshift);
    if (bNS)
    {
        wallcycle_start(wcycle, ewcNS);

        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
        }

        /* Do the actual neighbour searching */
        ns(fplog, fr, box,
           groups, top, mdatoms,
           cr, nrnb, bFillGrid);

        wallcycle_stop(wcycle, ewcNS);
    }
    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        wallcycle_start(wcycle, ewcPPDURINGPME);
        dd_force_flop_start(cr->dd, nrnb);
    }

    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.data()), t, step, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Temporary solution until all routines take PaddedRVecVector */
    rvec *f = as_rvec_array(force.data());

    /* Start the force cycle counter.
     * Note that a different counter is used for dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);

    gmx::ArrayRef<gmx::RVec> forceRef = force;
    if (bDoForces)
    {
        /* If we need to compute the virial, we might need a separate
         * force buffer for algorithms for which the virial is calculated
         * separately, such as PME.
         */
        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
        {
            forceRef = *fr->forceBufferForDirectVirialContributions;

            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
        }

        /* Clear the short- and long-range forces */
        clear_rvecs(fr->natoms_force_constr, f);
    }

    /* forceWithVirial might need the full force atom range */
    gmx::ForceWithVirial forceWithVirial(forceRef, (flags & GMX_FORCE_VIRIAL) != 0);

    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
    {
        clear_pull_forces(inputrec->pull_work);
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fr, inputrec, &(top->idef),
                      cr, ms, nrnb, wcycle, mdatoms,
                      as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd,
                      box, inputrec->fepvals, lambda,
                      graph, &(top->excls), fr->mu_tot,
                      flags,
                      &cycles_pme);

    wallcycle_stop(wcycle, ewcFORCE);
    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);

        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
        {
            ddCloseBalanceRegionCpu(cr->dd);
        }
    }

    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                         step, t, wcycle,
                         fr->forceProviders, box, x, mdatoms, lambda,
                         flags, &forceWithVirial, enerd,
                         ed, bNS);

    if (bDoForces)
    {
        /* Communicate the forces */
        if (DOMAINDECOMP(cr))
        {
            dd_move_f(cr->dd, force, fr->fshift, wcycle);
            /* Do we need to communicate the separate force array
             * for terms that do not contribute to the single sum virial?
             * Position restraints and electric fields do not introduce
             * inter-cg forces, only full electrostatics methods do.
             * When we do not calculate the virial, fr->f_novirsum = f,
             * so we have already communicated these forces.
             */
            if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
                (flags & GMX_FORCE_VIRIAL))
            {
                dd_move_f(cr->dd, forceWithVirial.force_, nullptr, wcycle);
            }
        }

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum=f later.
         */
        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
        {
            spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
                            vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    if (flags & GMX_FORCE_ENERGY)
    {
        /* Sum the potential energy terms from group contributions */
        sum_epot(&(enerd->grpp), enerd->term);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
}
2076 void do_force(FILE *fplog,
2077 const t_commrec *cr,
2078 const gmx_multisim_t *ms,
2079 const t_inputrec *inputrec,
2081 gmx_enfrot *enforcedRotation,
2084 gmx_wallcycle_t wcycle,
2085 gmx_localtop_t *top,
2086 const gmx_groups_t *groups,
2088 gmx::PaddedArrayRef<gmx::RVec> x,
2090 gmx::PaddedArrayRef<gmx::RVec> force,
2092 const t_mdatoms *mdatoms,
2093 gmx_enerdata_t *enerd,
2095 gmx::ArrayRef<real> lambda,
2098 const gmx_vsite_t *vsite,
2103 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
2104 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
2106 /* modify force flag if not doing nonbonded */
2107 if (!fr->bNonbonded)
2109 flags &= ~GMX_FORCE_NONBONDED;
2112 GMX_ASSERT(x.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
2113 GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
2115 switch (inputrec->cutoff_scheme)
2118 do_force_cutsVERLET(fplog, cr, ms, inputrec,
2119 awh, enforcedRotation, step, nrnb, wcycle,
2126 lambda.data(), graph,
2131 ddOpenBalanceRegion,
2132 ddCloseBalanceRegion);
2135 do_force_cutsGROUP(fplog, cr, ms, inputrec,
2136 awh, enforcedRotation, step, nrnb, wcycle,
2143 lambda.data(), graph,
2147 ddOpenBalanceRegion,
2148 ddCloseBalanceRegion);
2151 gmx_incons("Invalid cut-off scheme passed!");
2154 /* In case we don't have constraints and are using GPUs, the next balancing
2155 * region starts here.
 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
 * virial calculation and COM pulling, is thus not included in
 * the balance timing, which is OK as most of those tasks do communication.
2160 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
2162 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
2167 void do_constrain_first(FILE *fplog, gmx::Constraints *constr,
2168 const t_inputrec *ir, const t_mdatoms *md,
2171 int i, m, start, end;
2173 real dt = ir->delta_t;
2177 /* We need to allocate one element extra, since we might use
2178 * (unaligned) 4-wide SIMD loads to access rvec entries.
2180 snew(savex, state->natoms + 1);
2187 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
2188 start, md->homenr, end);
/* Do a first constraint pass to reset the particles... */
2191 step = ir->init_step;
2194 char buf[STEPSTRSIZE];
2195 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
2196 gmx_step_str(step, buf));
2200 /* constrain the current position */
2201 constr->apply(TRUE, FALSE,
2203 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
2205 state->lambda[efptBONDED], &dvdl_dum,
2206 nullptr, nullptr, gmx::ConstraintVariable::Positions);
/* constrain the initial velocities, and save them */
/* may also be useful if we need the ekin from the halfstep for velocity verlet */
2211 constr->apply(TRUE, FALSE,
2213 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
2215 state->lambda[efptBONDED], &dvdl_dum,
2216 nullptr, nullptr, gmx::ConstraintVariable::Velocities);
/* constrain the initial velocities at t-dt/2 */
2219 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
2221 for (i = start; (i < end); i++)
2223 for (m = 0; (m < DIM); m++)
2225 /* Reverse the velocity */
2226 state->v[i][m] = -state->v[i][m];
/* Store the position at t-dt in savex: with v just reversed,
 * x + dt*v equals x(0) - dt*v(0), an estimate of x at t-dt */
2228 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2231 /* Shake the positions at t=-dt with the positions at t=0
2232 * as reference coordinates.
2236 char buf[STEPSTRSIZE];
2237 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2238 gmx_step_str(step, buf));
2241 constr->apply(TRUE, FALSE,
2243 as_rvec_array(state->x.data()), savex, nullptr,
2245 state->lambda[efptBONDED], &dvdl_dum,
2246 as_rvec_array(state->v.data()), nullptr, gmx::ConstraintVariable::Positions);
2248 for (i = start; i < end; i++)
2250 for (m = 0; m < DIM; m++)
2252 /* Re-reverse the velocities */
2253 state->v[i][m] = -state->v[i][m];
2262 integrate_table(const real vdwtab[], real scale, int offstart, int rstart, int rend,
2263 double *enerout, double *virout)
2265 double enersum, virsum;
2266 double invscale, invscale2, invscale3;
2267 double r, ea, eb, ec, pa, pb, pc, pd;
2272 invscale = 1.0/scale;
2273 invscale2 = invscale*invscale;
2274 invscale3 = invscale*invscale2;
2276 /* Following summation derived from cubic spline definition,
 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2278 * the cubic spline. We first calculate the negative of the
2279 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2280 * add the more standard, abrupt cutoff correction to that result,
2281 * yielding the long-range correction for a switched function. We
2282 * perform both the pressure and energy loops at the same time for
2283 * simplicity, as the computational cost is low. */
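/* For reference, a sketch of where the coefficients in the loop below
 * come from (our reading of the code, not authoritative): on a table
 * interval [r, r + 1/scale], a spline coefficient c_n multiplying
 * (u*scale)^n contributes to the radial energy integral
 *
 *   int_0^{1/scale} (r + u)^2 * (u*scale)^n du
 *     = ec/(n+1) + eb/(n+2) + ea/(n+3),
 *
 * with ea = 1/scale^3, eb = 2*r/scale^2 and ec = r^2/scale. Applying
 * this for n = 0..3 to the coefficients (y0, f, g, h) reproduces the
 * enersum expression; the virsum terms with pa..pd follow the same
 * pattern from the integral of (r + u)^3 * dV/du. */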
2287 /* Since the dispersion table has been scaled down a factor
2288 * 6.0 and the repulsion a factor 12.0 to compensate for the
2289 * c6/c12 parameters inside nbfp[] being scaled up (to save
2290 * flops in kernels), we need to correct for this.
2301 for (ri = rstart; ri < rend; ++ri)
2305 eb = 2.0*invscale2*r;
2309 pb = 3.0*invscale2*r;
2310 pc = 3.0*invscale*r*r;
2313 /* this "8" is from the packing in the vdwtab array - perhaps
2314 should be defined? */
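/* Illustrative only (a hypothetical constant, not used by this code):
 *   constexpr int c_vdwtabStride = 8; // 2 interaction types x 4 spline coefficients
 * with which the line below would read
 *   offset = c_vdwtabStride*ri + offstart;
 */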
2316 offset = 8*ri + offstart;
2317 y0 = vdwtab[offset];
2318 f = vdwtab[offset+1];
2319 g = vdwtab[offset+2];
2320 h = vdwtab[offset+3];
2322 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2323 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2325 *enerout = 4.0*M_PI*enersum*tabfactor;
2326 *virout = 4.0*M_PI*virsum*tabfactor;
2329 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2331 double eners[2], virs[2], enersum, virsum;
2332 double r0, rc3, rc9;
2334 real scale, *vdwtab;
2336 fr->enershiftsix = 0;
2337 fr->enershifttwelve = 0;
2338 fr->enerdiffsix = 0;
2339 fr->enerdifftwelve = 0;
2341 fr->virdifftwelve = 0;
2343 const interaction_const_t *ic = fr->ic;
2345 if (eDispCorr != edispcNO)
2347 for (i = 0; i < 2; i++)
2352 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2353 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2354 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2355 (ic->vdwtype == evdwSHIFT) ||
2356 (ic->vdwtype == evdwSWITCH))
2358 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2359 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2360 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2363 "With dispersion correction rvdw-switch can not be zero "
2364 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2367 /* TODO This code depends on the logic in tables.c that
2368 constructs the table layout, which should be made
2369 explicit in future cleanup. */
2370 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2371 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2372 scale = fr->dispersionCorrectionTable->scale;
2373 vdwtab = fr->dispersionCorrectionTable->data;
2375 /* Round the cut-offs to exact table values for precision */
2376 ri0 = static_cast<int>(std::floor(ic->rvdw_switch*scale));
2377 ri1 = static_cast<int>(std::ceil(ic->rvdw*scale));
2379 /* The code below has some support for handling force-switching, i.e.
2380 * when the force (instead of potential) is switched over a limited
2381 * region. This leads to a constant shift in the potential inside the
2382 * switching region, which we can handle by adding a constant energy
2383 * term in the force-switch case just like when we do potential-shift.
2385 * For now this is not enabled, but to keep the functionality in the
2386 * code we check separately for switch and shift. When we do force-switch
2387 * the shifting point is rvdw_switch, while it is the cutoff when we
2388 * have a classical potential-shift.
2390 * For a pure potential-shift the potential has a constant shift
2391 * all the way out to the cutoff, and that is it. For other forms
2392 * we need to calculate the constant shift up to the point where we
2393 * start modifying the potential.
2395 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2401 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2402 (ic->vdwtype == evdwSHIFT))
2404 /* Determine the constant energy shift below rvdw_switch.
2405 * Table has a scale factor since we have scaled it down to compensate
2406 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2408 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2409 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
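/* Our reading of the two constants above (a sketch, not authoritative):
 * the tables store V_disp/6 and V_rep/12, so 6.0*vdwtab[8*ri0] and
 * 12.0*vdwtab[8*ri0 + 4] are the switched potentials at r0, while
 * -1.0/(rc3*rc3) = -1/r0^6 and 1.0/(rc9*rc3) = 1/r0^12 are the
 * unmodified forms; the shift constant is the difference of the two. */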
2411 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2413 fr->enershiftsix = static_cast<real>(-1.0/(rc3*rc3));
2414 fr->enershifttwelve = static_cast<real>( 1.0/(rc9*rc3));
2417 /* Add the constant part from 0 to rvdw_switch.
2418 * This integration from 0 to rvdw_switch overcounts the number
2419 * of interactions by 1, as it also counts the self interaction.
2420 * We will correct for this later.
2422 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2423 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
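/* That is, int_0^r0 4*pi*r^2 * enershift dr = (4*pi/3)*enershift*r0^3:
 * the constant energy shift integrated over the sphere of radius r0. */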
2425 /* Calculate the contribution in the range [r0,r1] where we
2426 * modify the potential. For a pure potential-shift modifier we will
2427 * have ri0==ri1, and there will not be any contribution here.
2429 for (i = 0; i < 2; i++)
2433 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2434 eners[i] -= enersum;
2438 /* Alright: Above we compensated by REMOVING the parts outside r0
 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2441 * Regardless of whether r0 is the point where we start switching,
2442 * or the cutoff where we calculated the constant shift, we include
2443 * all the parts we are missing out to infinity from r0 by
2444 * calculating the analytical dispersion correction.
2446 eners[0] += -4.0*M_PI/(3.0*rc3);
2447 eners[1] += 4.0*M_PI/(9.0*rc9);
2448 virs[0] += 8.0*M_PI/rc3;
2449 virs[1] += -16.0*M_PI/(3.0*rc9);
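/* For reference, these tail terms are the standard analytical integrals:
 *   int_rc^inf 4*pi*r^2 * (-1/r^6)        dr = -4*pi/(3*rc^3)
 *   int_rc^inf 4*pi*r^2 * ( 1/r^12)       dr =  4*pi/(9*rc^9)
 *   int_rc^inf 4*pi*r^2 * r*d(-1/r^6)/dr  dr =   8*pi/rc^3
 *   int_rc^inf 4*pi*r^2 * r*d(1/r^12)/dr  dr = -16*pi/(3*rc^9)
 * The same expressions appear again in the plain cut-off branch below. */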
2451 else if (ic->vdwtype == evdwCUT ||
2452 EVDW_PME(ic->vdwtype) ||
2453 ic->vdwtype == evdwUSER)
2455 if (ic->vdwtype == evdwUSER && fplog)
2458 "WARNING: using dispersion correction with user tables\n");
/* Note that with LJ-PME, the dispersion correction is multiplied
 * by the difference between the actual C6 and the value of C6
 * that would produce the combination rule.
 * This means the normal energy and virial difference formulas
 * can be used here as-is. */
2468 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2470 /* Contribution beyond the cut-off */
2471 eners[0] += -4.0*M_PI/(3.0*rc3);
2472 eners[1] += 4.0*M_PI/(9.0*rc9);
2473 if (ic->vdw_modifier == eintmodPOTSHIFT)
2475 /* Contribution within the cut-off */
2476 eners[0] += -4.0*M_PI/(3.0*rc3);
2477 eners[1] += 4.0*M_PI/(3.0*rc9);
2479 /* Contribution beyond the cut-off */
2480 virs[0] += 8.0*M_PI/rc3;
2481 virs[1] += -16.0*M_PI/(3.0*rc9);
2486 "Dispersion correction is not implemented for vdw-type = %s",
2487 evdw_names[ic->vdwtype]);
2490 /* When we deprecate the group kernels the code below can go too */
2491 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2493 /* Calculate self-interaction coefficient (assuming that
2494 * the reciprocal-space contribution is constant in the
2495 * region that contributes to the self-interaction).
2497 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2499 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2500 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2503 fr->enerdiffsix = eners[0];
2504 fr->enerdifftwelve = eners[1];
2505 /* The 0.5 is due to the Gromacs definition of the virial */
2506 fr->virdiffsix = 0.5*virs[0];
2507 fr->virdifftwelve = 0.5*virs[1];
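/* A note on the factors above (our summary, not authoritative): GROMACS
 * defines the virial as Xi = -(1/2) sum_i r_i (x) F_i, hence the 0.5
 * applied to virs[] here and the factor 2 in the pressure correction in
 * calc_dispcorr() below. */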
2511 void calc_dispcorr(const t_inputrec *ir, const t_forcerec *fr,
2512 const matrix box, real lambda, tensor pres, tensor virial,
2513 real *prescorr, real *enercorr, real *dvdlcorr)
2515 gmx_bool bCorrAll, bCorrPres;
2516 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2526 if (ir->eDispCorr != edispcNO)
2528 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2529 ir->eDispCorr == edispcAllEnerPres);
2530 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2531 ir->eDispCorr == edispcAllEnerPres);
2533 invvol = 1/det(box);
2536 /* Only correct for the interactions with the inserted molecule */
2537 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2542 dens = fr->numAtomsForDispersionCorrection*invvol;
2543 ninter = 0.5*fr->numAtomsForDispersionCorrection;
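/* Sanity check of the counting (illustrative): ninter*dens = N*N/(2*V)
 * for N atoms, the homogeneous-fluid pair density used in the tail
 * integrals; the '- fr->enershiftsix' term below removes the
 * self-interaction overcount noted in calc_enervirdiff(). */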
2546 if (ir->efep == efepNO)
2548 avcsix = fr->avcsix[0];
2549 avctwelve = fr->avctwelve[0];
2553 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2554 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2557 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2558 *enercorr += avcsix*enerdiff;
2560 if (ir->efep != efepNO)
2562 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2566 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2567 *enercorr += avctwelve*enerdiff;
if (ir->efep != efepNO)
2570 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2576 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2577 if (ir->eDispCorr == edispcAllEnerPres)
2579 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2581 /* The factor 2 is because of the Gromacs virial definition */
2582 spres = -2.0*invvol*svir*PRESFAC;
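/* PRESFAC (~16.6054) converts the internal pressure unit,
 * kJ mol^-1 nm^-3, to bar. */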
2584 for (m = 0; m < DIM; m++)
2586 virial[m][m] += svir;
2587 pres[m][m] += spres;
/* Can't currently control when this prints; for now, just print when debugging */
2597 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2603 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2604 *enercorr, spres, svir);
2608 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
if (ir->efep != efepNO)
2614 *dvdlcorr += dvdlambda;
2619 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2620 const gmx_mtop_t *mtop, rvec x[],
2626 if (bFirst && fplog)
2628 fprintf(fplog, "Removing pbc first time\n");
2633 for (const gmx_molblock_t &molb : mtop->molblock)
2635 const gmx_moltype_t &moltype = mtop->moltype[molb.type];
2636 if (moltype.atoms.nr == 1 ||
2637 (!bFirst && moltype.cgs.nr == 1))
2639 /* Just one atom or charge group in the molecule, no PBC required */
2640 as += molb.nmol*moltype.atoms.nr;
/* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2645 mk_graph_ilist(nullptr, moltype.ilist,
2646 0, moltype.atoms.nr, FALSE, FALSE, graph);
2648 for (mol = 0; mol < molb.nmol; mol++)
2650 mk_mshift(fplog, graph, ePBC, box, x+as);
2652 shift_self(graph, box, x+as);
2653 /* The molecule is whole now.
2654 * We don't need the second mk_mshift call as in do_pbc_first,
2655 * since we no longer need this graph.
2658 as += moltype.atoms.nr;
2666 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2667 const gmx_mtop_t *mtop, rvec x[])
2669 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2672 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2673 const gmx_mtop_t *mtop, rvec x[])
2675 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2678 void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
2681 nth = gmx_omp_nthreads_get(emntDefault);
2683 #pragma omp parallel for num_threads(nth) schedule(static)
2684 for (t = 0; t < nth; t++)
2688 size_t natoms = x.size();
2689 size_t offset = (natoms*t )/nth;
2690 size_t len = (natoms*(t + 1))/nth - offset;
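/* Static partitioning of the atom range over threads (worked example):
 * with natoms = 10 and nth = 4, the threads get offsets {0, 2, 5, 7}
 * and lengths {2, 3, 2, 3}, covering every atom exactly once. */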
2691 put_atoms_in_box(ePBC, box, x.subArray(offset, len));
2693 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2697 // TODO This can be cleaned up a lot, and move back to runner.cpp
2698 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
2699 const t_inputrec *inputrec,
2700 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2701 gmx_walltime_accounting_t walltime_accounting,
2702 nonbonded_verlet_t *nbv,
2703 const gmx_pme_t *pme,
2704 gmx_bool bWriteStat)
2706 t_nrnb *nrnb_tot = nullptr;
2708 double nbfs = 0, mflop = 0;
2709 double elapsed_time,
2710 elapsed_time_over_all_ranks,
2711 elapsed_time_over_all_threads,
2712 elapsed_time_over_all_threads_over_all_ranks;
2713 /* Control whether it is valid to print a report. Only the
2714 simulation master may print, but it should not do so if the run
2715 terminated e.g. before a scheduled reset step. This is
2716 complicated by the fact that PME ranks are unaware of the
2717 reason why they were sent a pmerecvqxFINISH. To avoid
2718 communication deadlocks, we always do the communication for the
2719 report, even if we've decided not to write the report, because
2720 how long it takes to finish the run is not important when we've
2721 decided not to report on the simulation performance.
2723 Further, we only report performance for dynamical integrators,
2724 because those are the only ones for which we plan to
2725 consider doing any optimizations. */
2726 bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
2728 if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
2730 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2731 printReport = false;
2738 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2739 cr->mpi_comm_mysim);
2747 elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
2748 elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
2752 /* reduce elapsed_time over all MPI ranks in the current simulation */
2753 MPI_Allreduce(&elapsed_time,
2754 &elapsed_time_over_all_ranks,
2755 1, MPI_DOUBLE, MPI_SUM,
2756 cr->mpi_comm_mysim);
2757 elapsed_time_over_all_ranks /= cr->nnodes;
2758 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2759 * current simulation. */
2760 MPI_Allreduce(&elapsed_time_over_all_threads,
2761 &elapsed_time_over_all_threads_over_all_ranks,
2762 1, MPI_DOUBLE, MPI_SUM,
2763 cr->mpi_comm_mysim);
2768 elapsed_time_over_all_ranks = elapsed_time;
2769 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2774 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2781 if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
2783 print_dd_statistics(cr, inputrec, fplog);
2786 /* TODO Move the responsibility for any scaling by thread counts
2787 * to the code that handled the thread region, so that there's a
2788 * mechanism to keep cycle counting working during the transition
2789 * to task parallelism. */
2790 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2791 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2792 wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
2793 auto cycle_sum(wallcycle_sum(cr, wcycle));
2797 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2798 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2799 if (pme_gpu_task_enabled(pme))
2801 pme_gpu_get_timings(pme, &pme_gpu_timings);
2803 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2804 elapsed_time_over_all_ranks,
2809 if (EI_DYNAMICS(inputrec->eI))
2811 delta_t = inputrec->delta_t;
2816 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2817 elapsed_time_over_all_ranks,
2818 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2819 delta_t, nbfs, mflop);
2823 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2824 elapsed_time_over_all_ranks,
2825 walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
2826 delta_t, nbfs, mflop);
2831 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2833 /* this function works, but could probably use a logic rewrite to keep all the different
2834 types of efep straight. */
2836 if ((ir->efep == efepNO) && (!ir->bSimTemp))
2841 t_lambda *fep = ir->fepvals;
*fep_state = fep->init_fep_state; /* this might overwrite the value read from
a checkpoint, if one is set -- a kludge for now */
2846 for (int i = 0; i < efptNR; i++)
2848 /* overwrite lambda state with init_lambda for now for backwards compatibility */
if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2851 lambda[i] = fep->init_lambda;
2854 lam0[i] = lambda[i];
2859 lambda[i] = fep->all_lambda[i][*fep_state];
2862 lam0[i] = lambda[i];
2868 /* need to rescale control temperatures to match current state */
2869 for (int i = 0; i < ir->opts.ngtc; i++)
2871 if (ir->opts.ref_t[i] > 0)
2873 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2878 /* Send to the log the information on the current lambdas */
2879 if (fplog != nullptr)
2881 fprintf(fplog, "Initial vector of lambda components:[ ");
2882 for (const auto &l : lambda)
2884 fprintf(fplog, "%10.4f ", l);
2886 fprintf(fplog, "]\n");
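/* The resulting log line looks like (illustrative values only):
 *   Initial vector of lambda components:[     0.0000     0.5000 ]
 */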
2891 void init_md(FILE *fplog,
2892 const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2893 t_inputrec *ir, const gmx_output_env_t *oenv,
2894 const MdrunOptions &mdrunOptions,
2895 double *t, double *t0,
2896 t_state *globalState, double *lam0,
2897 t_nrnb *nrnb, gmx_mtop_t *mtop,
2899 gmx::BoxDeformation *deform,
2900 int nfile, const t_filenm fnm[],
2901 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2902 tensor force_vir, tensor shake_vir,
2903 tensor total_vir, tensor pres, rvec mu_tot,
2904 gmx_bool *bSimAnn, t_vcm **vcm,
2905 gmx_wallcycle_t wcycle)
2909 /* Initial values */
2910 *t = *t0 = ir->init_t;
2913 for (i = 0; i < ir->opts.ngtc; i++)
2915 /* set bSimAnn if any group is being annealed */
2916 if (ir->opts.annealing[i] != eannNO)
2922 /* Initialize lambda variables */
2923 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2924 * We currently need to call initialize_lambdas on non-master ranks
2925 * to initialize lam0.
2929 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2934 std::array<real, efptNR> tmpLambda;
2935 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2938 // TODO upd is never NULL in practice, but the analysers don't know that
2941 *upd = init_update(ir, deform);
2945 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2950 *vcm = init_vcm(fplog, &mtop->groups, ir);
2953 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2955 if (ir->etc == etcBERENDSEN)
2957 please_cite(fplog, "Berendsen84a");
2959 if (ir->etc == etcVRESCALE)
2961 please_cite(fplog, "Bussi2007a");
2963 if (ir->eI == eiSD1)
2965 please_cite(fplog, "Goga2012");
2972 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2974 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2975 mtop, ir, mdoutf_get_fp_dhdl(*outf));
/* Initialize variables */
2979 clear_mat(force_vir);
2980 clear_mat(shake_vir);
2982 clear_mat(total_vir);