2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/constr.h"
73 #include "gromacs/mdlib/force.h"
74 #include "gromacs/mdlib/forcerec.h"
75 #include "gromacs/mdlib/genborn.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/iforceprovider.h"
88 #include "gromacs/mdtypes/inputrec.h"
89 #include "gromacs/mdtypes/md_enums.h"
90 #include "gromacs/mdtypes/state.h"
91 #include "gromacs/pbcutil/ishift.h"
92 #include "gromacs/pbcutil/mshift.h"
93 #include "gromacs/pbcutil/pbc.h"
94 #include "gromacs/pulling/pull.h"
95 #include "gromacs/pulling/pull_rotation.h"
96 #include "gromacs/timing/cyclecounter.h"
97 #include "gromacs/timing/gpu_timing.h"
98 #include "gromacs/timing/wallcycle.h"
99 #include "gromacs/timing/wallcyclereporting.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/arrayref.h"
102 #include "gromacs/utility/basedefinitions.h"
103 #include "gromacs/utility/cstringutil.h"
104 #include "gromacs/utility/exceptions.h"
105 #include "gromacs/utility/fatalerror.h"
106 #include "gromacs/utility/gmxassert.h"
107 #include "gromacs/utility/gmxmpi.h"
108 #include "gromacs/utility/logger.h"
109 #include "gromacs/utility/pleasecite.h"
110 #include "gromacs/utility/smalloc.h"
111 #include "gromacs/utility/sysinfo.h"
113 #include "nbnxn_gpu.h"
114 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
115 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
117 void print_time(FILE *out,
118 gmx_walltime_accounting_t walltime_accounting,
121 t_commrec gmx_unused *cr)
124 char timebuf[STRLEN];
125 double dt, elapsed_seconds, time_per_step;
134 fprintf(out, "step %s", gmx_step_str(step, buf));
137 if ((step >= ir->nstlist))
139 double seconds_since_epoch = gmx_gettime();
140 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
141 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
142 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
148 finish = (time_t) (seconds_since_epoch + dt);
149 gmx_ctime_r(&finish, timebuf, STRLEN);
150 sprintf(buf, "%s", timebuf);
151 buf[strlen(buf)-1] = '\0';
152 fprintf(out, ", will finish %s", buf);
156 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
161 fprintf(out, " performance: %.1f ns/day ",
162 ir->delta_t/1000*24*60*60/time_per_step);
175 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
178 char time_string[STRLEN];
187 char timebuf[STRLEN];
188 time_t temp_time = (time_t) the_time;
190 gmx_ctime_r(&temp_time, timebuf, STRLEN);
191 for (i = 0; timebuf[i] >= ' '; i++)
193 time_string[i] = timebuf[i];
195 time_string[i] = '\0';
198 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
201 void print_start(FILE *fplog, t_commrec *cr,
202 gmx_walltime_accounting_t walltime_accounting,
207 sprintf(buf, "Started %s", name);
208 print_date_and_time(fplog, cr->nodeid, buf,
209 walltime_accounting_get_start_time_stamp(walltime_accounting));
212 static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
214 /* TODO: remove this - 1 when padding is properly implemented */
215 int end = forceToAdd->size() - 1;
216 const rvec *fAdd = as_rvec_array(forceToAdd->data());
218 // cppcheck-suppress unreadVariable
219 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221 for (int i = 0; i < end; i++)
223 rvec_inc(f[i], fAdd[i]);
227 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
228 tensor vir_part, t_graph *graph, matrix box,
229 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
233 /* The short-range virial from surrounding boxes */
234 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
235 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
237 /* Calculate partial virial, for local atoms only, based on short range.
238 * Total virial is computed in global_stat, called from do_md
240 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
241 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
243 /* Add position restraint contribution */
244 for (i = 0; i < DIM; i++)
246 vir_part[i][i] += fr->vir_diag_posres[i];
249 /* Add wall contribution */
250 for (i = 0; i < DIM; i++)
252 vir_part[i][ZZ] += fr->vir_wall_z[i];
257 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
261 static void pull_potential_wrapper(t_commrec *cr,
263 matrix box, rvec x[],
267 gmx_enerdata_t *enerd,
270 gmx_wallcycle_t wcycle)
275 /* Calculate the center of mass forces, this requires communication,
276 * which is why pull_potential is called close to other communication.
277 * The virial contribution is calculated directly,
278 * which is why we call pull_potential after calc_virial.
280 wallcycle_start(wcycle, ewcPULLPOT);
281 set_pbc(&pbc, ir->ePBC, box);
283 enerd->term[F_COM_PULL] +=
284 pull_potential(ir->pull_work, mdatoms, &pbc,
285 cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
286 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
287 wallcycle_stop(wcycle, ewcPULLPOT);
290 static void pme_receive_force_ener(t_commrec *cr,
291 gmx_wallcycle_t wcycle,
292 gmx_enerdata_t *enerd,
295 real e_q, e_lj, dvdl_q, dvdl_lj;
296 float cycles_ppdpme, cycles_seppme;
298 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
299 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
301 /* In case of node-splitting, the PP nodes receive the long-range
302 * forces, virial and energy from the PME nodes here.
304 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
307 gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
308 fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
310 enerd->term[F_COUL_RECIP] += e_q;
311 enerd->term[F_LJ_RECIP] += e_lj;
312 enerd->dvdl_lin[efptCOUL] += dvdl_q;
313 enerd->dvdl_lin[efptVDW] += dvdl_lj;
317 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
319 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
322 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
323 gmx_int64_t step, real forceTolerance,
324 const rvec *x, const rvec *f)
326 real force2Tolerance = gmx::square(forceTolerance);
327 std::uintmax_t numNonFinite = 0;
328 for (int i = 0; i < md->homenr; i++)
330 real force2 = norm2(f[i]);
331 bool nonFinite = !std::isfinite(force2);
332 if (force2 >= force2Tolerance || nonFinite)
334 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
336 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
343 if (numNonFinite > 0)
345 /* Note that with MPI this fatal call on one rank might interrupt
346 * the printing on other ranks. But we can only avoid that with
347 * an expensive MPI barrier that we would need at each step.
349 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
353 static void post_process_forces(t_commrec *cr,
355 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
357 matrix box, rvec x[],
362 t_forcerec *fr, gmx_vsite_t *vsite,
369 /* Spread the mesh force on virtual sites to the other particles...
370 * This is parallellized. MPI communication is performed
371 * if the constructing atoms aren't local.
373 wallcycle_start(wcycle, ewcVSITESPREAD);
374 spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
375 (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
377 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
378 wallcycle_stop(wcycle, ewcVSITESPREAD);
380 if (flags & GMX_FORCE_VIRIAL)
382 /* Now add the forces, this is local */
383 sum_forces(f, fr->f_novirsum);
385 if (EEL_FULL(fr->ic->eeltype))
387 /* Add the mesh contribution to the virial */
388 m_add(vir_force, fr->vir_el_recip, vir_force);
390 if (EVDW_PME(fr->ic->vdwtype))
392 /* Add the mesh contribution to the virial */
393 m_add(vir_force, fr->vir_lj_recip, vir_force);
397 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
402 if (fr->print_force >= 0)
404 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
408 static void do_nb_verlet(t_forcerec *fr,
409 interaction_const_t *ic,
410 gmx_enerdata_t *enerd,
411 int flags, int ilocality,
415 gmx_wallcycle_t wcycle)
417 if (!(flags & GMX_FORCE_NONBONDED))
419 /* skip non-bonded calculation */
423 nonbonded_verlet_t *nbv = fr->nbv;
424 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
426 /* GPU kernel launch overhead is already timed separately */
427 if (fr->cutoff_scheme != ecutsVERLET)
429 gmx_incons("Invalid cut-off scheme passed!");
432 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
434 if (!bUsingGpuKernels)
436 /* When dynamic pair-list pruning is requested, we need to prune
437 * at nstlistPrune steps.
439 if (nbv->listParams->useDynamicPruning &&
440 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
442 /* Prune the pair-list beyond fr->ic->rlistPrune using
443 * the current coordinates of the atoms.
445 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
446 nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
447 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
450 wallcycle_sub_start(wcycle, ewcsNONBONDED);
453 switch (nbvg->kernel_type)
455 case nbnxnk4x4_PlainC:
456 case nbnxnk4xN_SIMD_4xN:
457 case nbnxnk4xN_SIMD_2xNN:
458 nbnxn_kernel_cpu(nbvg,
465 enerd->grpp.ener[egCOULSR],
467 enerd->grpp.ener[egBHAMSR] :
468 enerd->grpp.ener[egLJSR]);
471 case nbnxnk8x8x8_GPU:
472 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
475 case nbnxnk8x8x8_PlainC:
476 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
483 enerd->grpp.ener[egCOULSR],
485 enerd->grpp.ener[egBHAMSR] :
486 enerd->grpp.ener[egLJSR]);
490 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
493 if (!bUsingGpuKernels)
495 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
498 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
499 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
501 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
503 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
504 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
506 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
510 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
512 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
513 if (flags & GMX_FORCE_ENERGY)
515 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
516 enr_nbnxn_kernel_ljc += 1;
517 enr_nbnxn_kernel_lj += 1;
520 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
521 nbvg->nbl_lists.natpair_ljq);
522 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
523 nbvg->nbl_lists.natpair_lj);
524 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
525 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
526 nbvg->nbl_lists.natpair_q);
528 if (ic->vdw_modifier == eintmodFORCESWITCH)
530 /* We add up the switch cost separately */
531 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
532 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
534 if (ic->vdw_modifier == eintmodPOTSWITCH)
536 /* We add up the switch cost separately */
537 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
538 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
540 if (ic->vdwtype == evdwPME)
542 /* We add up the LJ Ewald cost separately */
543 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
548 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
555 gmx_enerdata_t *enerd,
558 gmx_wallcycle_t wcycle)
561 nb_kernel_data_t kernel_data;
563 real dvdl_nb[efptNR];
568 /* Add short-range interactions */
569 donb_flags |= GMX_NONBONDED_DO_SR;
571 /* Currently all group scheme kernels always calculate (shift-)forces */
572 if (flags & GMX_FORCE_FORCES)
574 donb_flags |= GMX_NONBONDED_DO_FORCE;
576 if (flags & GMX_FORCE_VIRIAL)
578 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
580 if (flags & GMX_FORCE_ENERGY)
582 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
585 kernel_data.flags = donb_flags;
586 kernel_data.lambda = lambda;
587 kernel_data.dvdl = dvdl_nb;
589 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
590 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
592 /* reset free energy components */
593 for (i = 0; i < efptNR; i++)
598 assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
600 wallcycle_sub_start(wcycle, ewcsNONBONDED);
601 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
602 for (th = 0; th < nbl_lists->nnbl; th++)
606 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
607 x, f, fr, mdatoms, &kernel_data, nrnb);
609 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
612 if (fepvals->sc_alpha != 0)
614 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
615 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
619 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
620 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
623 /* If we do foreign lambda and we have soft-core interactions
624 * we have to recalculate the (non-linear) energies contributions.
626 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
628 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
629 kernel_data.lambda = lam_i;
630 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
631 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
632 /* Note that we add to kernel_data.dvdl, but ignore the result */
634 for (i = 0; i < enerd->n_lambda; i++)
636 for (j = 0; j < efptNR; j++)
638 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
640 reset_foreign_enerdata(enerd);
641 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
642 for (th = 0; th < nbl_lists->nnbl; th++)
646 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
647 x, f, fr, mdatoms, &kernel_data, nrnb);
649 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
652 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
653 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
657 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
660 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
662 return nbv != nullptr && nbv->bUseGPU;
665 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
667 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
669 /* Note that we would like to avoid this conditional by putting it
670 * into the omp pragma instead, but then we still take the full
671 * omp parallel for overhead (at least with gcc5).
675 for (int i = 0; i < n; i++)
682 #pragma omp parallel for num_threads(nth) schedule(static)
683 for (int i = 0; i < n; i++)
690 /*! \brief This routine checks if the potential energy is finite.
692 * Note that passing this check does not guarantee finite forces,
693 * since those use slightly different arithmetics. But in most cases
694 * there is just a narrow coordinate range where forces are not finite
695 * and energies are finite.
697 * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
699 static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
701 if (!std::isfinite(enerd->term[F_EPOT]))
703 gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
706 enerd->term[F_COUL_SR]);
710 /*! \brief Compute forces and/or energies for special algorithms
712 * The intention is to collect all calls to algorithms that compute
713 * forces on local atoms only and that do not contribute to the local
714 * virial sum (but add their virial contribution separately).
715 * Eventually these should likely all become ForceProviders.
716 * Within this function the intention is to have algorithms that do
717 * global communication at the end, so global barriers within the MD loop
718 * are as close together as possible.
720 * \param[in] cr The communication record
721 * \param[in] inputrec The input record
722 * \param[in] step The current MD step
723 * \param[in] t The current time
724 * \param[in,out] wcycle Wallcycle accounting struct
725 * \param[in,out] forceProviders Pointer to a list of force providers
726 * \param[in] box The unit cell
727 * \param[in] x The coordinates
728 * \param[in] mdatoms Per atom properties
729 * \param[in] lambda Array of free-energy lambda values
730 * \param[in] forceFlags Flags that tell whether we should compute forces/energies/virial
731 * \param[in,out] force Force buffer; does not contribute to the virial
732 * \param[in,out] virial Virial buffer
733 * \param[in,out] enerd Energy buffer
734 * \param[in,out] ed Essential dynamics pointer
735 * \param[in] bNS Tells if we did neighbor searching this step, used for ED sampling
737 * \todo Remove bNS, which is used incorrectly.
738 * \todo Convert all other algorithms called here to ForceProviders.
741 computeSpecialForces(t_commrec *cr,
742 t_inputrec *inputrec,
745 gmx_wallcycle_t wcycle,
746 ForceProviders *forceProviders,
752 PaddedRVecVector *force,
754 gmx_enerdata_t *enerd,
758 const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
760 /* NOTE: Currently all ForceProviders only provide forces.
761 * When they also provide energies, remove this conditional.
765 /* Collect forces from modules */
766 gmx::ArrayRef<gmx::RVec> f = *force;
768 forceProviders->calculateForces(cr, mdatoms, box, t, x, f);
771 rvec *f = as_rvec_array(force->data());
773 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
775 pull_potential_wrapper(cr, inputrec, box, x,
776 f, virial, mdatoms, enerd, lambda, t,
780 /* Add the forces from enforced rotation potentials (if any) */
783 wallcycle_start(wcycle, ewcROTadd);
784 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
785 wallcycle_stop(wcycle, ewcROTadd);
790 /* Note that since init_edsam() is called after the initialization
791 * of forcerec, edsam doesn't request the noVirSum force buffer.
792 * Thus if no other algorithm (e.g. PME) requires it, the forces
793 * here will contribute to the virial.
795 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
798 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
799 if (inputrec->bIMD && computeForces)
801 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
805 static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
806 t_inputrec *inputrec,
807 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
809 gmx_groups_t gmx_unused *groups,
810 matrix box, rvec x[], history_t *hist,
811 PaddedRVecVector *force,
814 gmx_enerdata_t *enerd, t_fcdata *fcd,
815 real *lambda, t_graph *graph,
816 t_forcerec *fr, interaction_const_t *ic,
817 gmx_vsite_t *vsite, rvec mu_tot,
818 double t, gmx_edsam_t ed,
821 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
822 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
826 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
827 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
828 rvec vzero, box_diag;
829 float cycles_pme, cycles_wait_gpu;
830 nonbonded_verlet_t *nbv = fr->nbv;
832 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
833 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
834 bFillGrid = (bNS && bStateChanged);
835 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
836 bDoForces = (flags & GMX_FORCE_FORCES);
837 bUseGPU = fr->nbv->bUseGPU;
838 bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
840 /* At a search step we need to start the first balancing region
841 * somewhere early inside the step after communication during domain
842 * decomposition (and not during the previous step as usual).
845 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
847 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
853 const int homenr = mdatoms->homenr;
855 clear_mat(vir_force);
857 if (DOMAINDECOMP(cr))
859 cg1 = cr->dd->ncg_tot;
872 update_forcerec(fr, box);
874 if (inputrecNeedMutot(inputrec))
876 /* Calculate total (local) dipole moment in a temporary common array.
877 * This makes it possible to sum them over nodes faster.
879 calc_mu(start, homenr,
880 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
885 if (fr->ePBC != epbcNONE)
887 /* Compute shift vectors every step,
888 * because of pressure coupling or box deformation!
890 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
892 calc_shifts(box, fr->shift_vec);
897 put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
898 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
900 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
902 unshift_self(graph, box, x);
906 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
907 fr->shift_vec, nbv->nbat);
910 if (!(cr->duty & DUTY_PME))
912 /* Send particle coordinates to the pme nodes.
913 * Since this is only implemented for domain decomposition
914 * and domain decomposition does not use the graph,
915 * we do not need to worry about shifting.
917 wallcycle_start(wcycle, ewcPP_PMESENDX);
918 gmx_pme_send_coordinates(cr, box, x,
919 lambda[efptCOUL], lambda[efptVDW],
920 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
922 wallcycle_stop(wcycle, ewcPP_PMESENDX);
926 /* do gridding for pair search */
929 if (graph && bStateChanged)
931 /* Calculate intramolecular shift vectors to make molecules whole */
932 mk_mshift(fplog, graph, fr->ePBC, box, x);
936 box_diag[XX] = box[XX][XX];
937 box_diag[YY] = box[YY][YY];
938 box_diag[ZZ] = box[ZZ][ZZ];
940 wallcycle_start(wcycle, ewcNS);
943 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
944 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
946 0, mdatoms->homenr, -1, fr->cginfo, x,
948 nbv->grp[eintLocal].kernel_type,
950 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
954 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
955 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
957 nbv->grp[eintNonlocal].kernel_type,
959 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
962 nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
963 wallcycle_stop(wcycle, ewcNS);
966 /* initialize the GPU atom data and copy shift vector */
969 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
970 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
974 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
977 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
979 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
980 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
983 /* do local pair search */
986 wallcycle_start_nocount(wcycle, ewcNS);
987 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
988 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
990 nbv->listParams->rlistOuter,
991 nbv->min_ci_balanced,
992 &nbv->grp[eintLocal].nbl_lists,
994 nbv->grp[eintLocal].kernel_type,
996 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
997 if (nbv->listParams->useDynamicPruning && !bUseGPU)
999 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
1001 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
1005 /* initialize local pair-list on the GPU */
1006 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1007 nbv->grp[eintLocal].nbl_lists.nbl[0],
1010 wallcycle_stop(wcycle, ewcNS);
1014 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1015 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1016 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
1018 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1019 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1024 if (DOMAINDECOMP(cr))
1026 ddOpenBalanceRegionGpu(cr->dd);
1029 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1030 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1031 /* launch local nonbonded work on GPU */
1032 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
1033 step, nrnb, wcycle);
1034 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1035 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1038 /* Communicate coordinates and sum dipole if necessary +
1039 do non-local pair search */
1040 if (DOMAINDECOMP(cr))
1044 wallcycle_start_nocount(wcycle, ewcNS);
1045 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1047 nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
1049 nbv->listParams->rlistOuter,
1050 nbv->min_ci_balanced,
1051 &nbv->grp[eintNonlocal].nbl_lists,
1053 nbv->grp[eintNonlocal].kernel_type,
1055 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1056 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1058 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1060 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1062 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1064 /* initialize non-local pair-list on the GPU */
1065 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1066 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1069 wallcycle_stop(wcycle, ewcNS);
1073 wallcycle_start(wcycle, ewcMOVEX);
1074 dd_move_x(cr->dd, box, x);
1075 wallcycle_stop(wcycle, ewcMOVEX);
1077 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1078 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1079 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
1081 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1082 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1087 wallcycle_start(wcycle, ewcLAUNCH_GPU);
1088 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1089 /* launch non-local nonbonded tasks on GPU */
1090 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1091 step, nrnb, wcycle);
1092 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1093 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1099 /* launch D2H copy-back F */
1100 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1101 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1102 if (DOMAINDECOMP(cr))
1104 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1105 flags, eatNonlocal);
1107 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
1109 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1110 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1113 if (bStateChanged && inputrecNeedMutot(inputrec))
1117 gmx_sumd(2*DIM, mu, cr);
1118 ddReopenBalanceRegionCpu(cr->dd);
1121 for (i = 0; i < 2; i++)
1123 for (j = 0; j < DIM; j++)
1125 fr->mu_tot[i][j] = mu[i*DIM + j];
1129 if (fr->efep == efepNO)
1131 copy_rvec(fr->mu_tot[0], mu_tot);
1135 for (j = 0; j < DIM; j++)
1138 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1139 lambda[efptCOUL]*fr->mu_tot[1][j];
1143 /* Reset energies */
1144 reset_enerdata(enerd);
1145 clear_rvecs(SHIFTS, fr->fshift);
1147 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1149 wallcycle_start(wcycle, ewcPPDURINGPME);
1150 dd_force_flop_start(cr->dd, nrnb);
1155 wallcycle_start(wcycle, ewcROT);
1156 do_rotation(cr, inputrec, box, x, t, step, bNS);
1157 wallcycle_stop(wcycle, ewcROT);
1160 /* Temporary solution until all routines take PaddedRVecVector */
1161 rvec *f = as_rvec_array(force->data());
1163 /* Start the force cycle counter.
1164 * Note that a different counter is used for dynamic load balancing.
1166 wallcycle_start(wcycle, ewcFORCE);
1167 fr->f_novirsum = force;
1170 /* If we need to compute the virial, we might need a separate
1171 * force buffer for algorithms for which the virial is calculated
1172 * separately, such as PME.
1174 if (fr->bF_NoVirSum && (flags & GMX_FORCE_VIRIAL))
1176 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1178 /* TODO: remove this - 1 when padding is properly implemented */
1179 clear_rvecs_omp(fr->f_novirsum->size() - 1,
1180 as_rvec_array(fr->f_novirsum->data()));
1182 /* Clear the short- and long-range forces */
1183 clear_rvecs_omp(fr->natoms_force_constr, f);
1185 clear_rvec(fr->vir_diag_posres);
1188 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1190 clear_pull_forces(inputrec->pull_work);
1193 /* We calculate the non-bonded forces, when done on the CPU, here.
1194 * We do this before calling do_force_lowlevel, because in that
1195 * function, the listed forces are calculated before PME, which
1196 * does communication. With this order, non-bonded and listed
1197 * force calculation imbalance can be balanced out by the domain
1198 * decomposition load balancing.
1203 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1204 step, nrnb, wcycle);
1207 if (fr->efep != efepNO)
1209 /* Calculate the local and non-local free energy interactions here.
1210 * Happens here on the CPU both with and without GPU.
1212 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1214 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1216 inputrec->fepvals, lambda,
1217 enerd, flags, nrnb, wcycle);
1220 if (DOMAINDECOMP(cr) &&
1221 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1223 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1225 inputrec->fepvals, lambda,
1226 enerd, flags, nrnb, wcycle);
1234 if (DOMAINDECOMP(cr))
1236 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1237 step, nrnb, wcycle);
1246 aloc = eintNonlocal;
1249 /* Add all the non-bonded force to the normal force array.
1250 * This can be split into a local and a non-local part when overlapping
1251 * communication with calculation with domain decomposition.
1253 wallcycle_stop(wcycle, ewcFORCE);
1254 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1255 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1256 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
1257 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1258 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1259 wallcycle_start_nocount(wcycle, ewcFORCE);
1261 /* if there are multiple fshift output buffers reduce them */
1262 if ((flags & GMX_FORCE_VIRIAL) &&
1263 nbv->grp[aloc].nbl_lists.nnbl > 1)
1265 /* This is not in a subcounter because it takes a
1266 negligible and constant-sized amount of time */
1267 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
1272 /* update QMMMrec, if necessary */
1275 update_QMMMrec(cr, fr, x, mdatoms, box);
1278 /* Compute the bonded and non-bonded energies and optionally forces */
1279 do_force_lowlevel(fr, inputrec, &(top->idef),
1280 cr, nrnb, wcycle, mdatoms,
1281 x, hist, f, enerd, fcd, top, fr->born,
1283 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1284 flags, &cycles_pme);
1286 wallcycle_stop(wcycle, ewcFORCE);
1288 computeSpecialForces(cr, inputrec, step, t, wcycle,
1289 fr->forceProviders, box, x, mdatoms, lambda,
1290 flags, fr->f_novirsum, vir_force, enerd,
1295 /* wait for non-local forces (or calculate in emulation mode) */
1296 if (DOMAINDECOMP(cr))
1300 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1301 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1303 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1305 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1309 wallcycle_start_nocount(wcycle, ewcFORCE);
1310 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1311 step, nrnb, wcycle);
1312 wallcycle_stop(wcycle, ewcFORCE);
1314 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1315 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1316 /* skip the reduction if there was no non-local work to do */
1317 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1319 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1322 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1323 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1327 if (DOMAINDECOMP(cr))
1329 /* We are done with the CPU compute.
1330 * We will now communicate the non-local forces.
1331 * If we use a GPU this will overlap with GPU work, so in that case
1332 * we do not close the DD force balancing region here.
1334 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1336 ddCloseBalanceRegionCpu(cr->dd);
1340 wallcycle_start(wcycle, ewcMOVEF);
1341 dd_move_f(cr->dd, f, fr->fshift);
1342 wallcycle_stop(wcycle, ewcMOVEF);
1348 /* wait for local forces (or calculate in emulation mode) */
1351 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1352 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1353 * but even with a step of 0.1 ms the difference is less than 1%
1356 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1358 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1359 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1361 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1363 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1365 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1367 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1368 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1370 /* We measured few cycles, it could be that the kernel
1371 * and transfer finished earlier and there was no actual
1372 * wait time, only API call overhead.
1373 * Then the actual time could be anywhere between 0 and
1374 * cycles_wait_est. We will use half of cycles_wait_est.
1376 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1378 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1381 /* now clear the GPU outputs while we finish the step on the CPU */
1382 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
1383 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1384 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1386 /* Is dynamic pair-list pruning activated? */
1387 if (nbv->listParams->useDynamicPruning)
1389 /* We should not launch the rolling pruning kernel at a search
1390 * step or just before search steps, since that's useless.
1391 * Without domain decomposition we prune at even steps.
1392 * With domain decomposition we alternate local and non-local
1393 * pruning at even and odd steps.
1395 int numRollingParts = nbv->listParams->numRollingParts;
1396 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1397 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1398 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1399 if (stepWithCurrentList > 0 &&
1400 stepWithCurrentList < inputrec->nstlist - 1 &&
1401 (stepIsEven || DOMAINDECOMP(cr)))
1403 nbnxn_gpu_launch_kernel_pruneonly(fr->nbv->gpu_nbv,
1404 stepIsEven ? eintLocal : eintNonlocal,
1408 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
1409 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
1413 wallcycle_start_nocount(wcycle, ewcFORCE);
1414 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1415 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1416 step, nrnb, wcycle);
1417 wallcycle_stop(wcycle, ewcFORCE);
1419 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1420 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1421 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1423 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1424 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1427 if (DOMAINDECOMP(cr))
1429 dd_force_flop_stop(cr->dd, nrnb);
1434 /* If we have NoVirSum forces, but we do not calculate the virial,
1435 * we sum fr->f_novirsum=f later.
1437 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1439 wallcycle_start(wcycle, ewcVSITESPREAD);
1440 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1441 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1442 wallcycle_stop(wcycle, ewcVSITESPREAD);
1445 if (flags & GMX_FORCE_VIRIAL)
1447 /* Calculation of the virial must be done after vsites! */
1448 calc_virial(0, mdatoms->homenr, x, f,
1449 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1453 if (PAR(cr) && !(cr->duty & DUTY_PME))
1455 /* In case of node-splitting, the PP nodes receive the long-range
1456 * forces, virial and energy from the PME nodes here.
1458 pme_receive_force_ener(cr, wcycle, enerd, fr);
1463 post_process_forces(cr, step, nrnb, wcycle,
1464 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1468 if (flags & GMX_FORCE_ENERGY)
1470 /* Sum the potential energy terms from group contributions */
1471 sum_epot(&(enerd->grpp), enerd->term);
1473 if (!EI_TPI(inputrec->eI))
1475 checkPotentialEnergyValidity(enerd);
1480 static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1481 t_inputrec *inputrec,
1482 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1483 gmx_localtop_t *top,
1484 gmx_groups_t *groups,
1485 matrix box, rvec x[], history_t *hist,
1486 PaddedRVecVector *force,
1489 gmx_enerdata_t *enerd, t_fcdata *fcd,
1490 real *lambda, t_graph *graph,
1491 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1492 double t, gmx_edsam_t ed,
1493 gmx_bool bBornRadii,
1495 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1496 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1500 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1504 const int start = 0;
1505 const int homenr = mdatoms->homenr;
1507 clear_mat(vir_force);
1510 if (DOMAINDECOMP(cr))
1512 cg1 = cr->dd->ncg_tot;
1523 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1524 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1525 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1526 bFillGrid = (bNS && bStateChanged);
1527 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1528 bDoForces = (flags & GMX_FORCE_FORCES);
1532 update_forcerec(fr, box);
1534 if (inputrecNeedMutot(inputrec))
1536 /* Calculate total (local) dipole moment in a temporary common array.
1537 * This makes it possible to sum them over nodes faster.
1539 calc_mu(start, homenr,
1540 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1545 if (fr->ePBC != epbcNONE)
1547 /* Compute shift vectors every step,
1548 * because of pressure coupling or box deformation!
1550 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1552 calc_shifts(box, fr->shift_vec);
1557 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1558 &(top->cgs), x, fr->cg_cm);
1559 inc_nrnb(nrnb, eNR_CGCM, homenr);
1560 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1562 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1564 unshift_self(graph, box, x);
1569 calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1570 inc_nrnb(nrnb, eNR_CGCM, homenr);
1573 if (bCalcCGCM && gmx_debug_at)
1575 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1579 if (!(cr->duty & DUTY_PME))
1581 /* Send particle coordinates to the pme nodes.
1582 * Since this is only implemented for domain decomposition
1583 * and domain decomposition does not use the graph,
1584 * we do not need to worry about shifting.
1586 wallcycle_start(wcycle, ewcPP_PMESENDX);
1587 gmx_pme_send_coordinates(cr, box, x,
1588 lambda[efptCOUL], lambda[efptVDW],
1589 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1591 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1593 #endif /* GMX_MPI */
1595 /* Communicate coordinates and sum dipole if necessary */
1596 if (DOMAINDECOMP(cr))
1598 wallcycle_start(wcycle, ewcMOVEX);
1599 dd_move_x(cr->dd, box, x);
1600 wallcycle_stop(wcycle, ewcMOVEX);
1601 /* No GPU support, no move_x overlap, so reopen the balance region here */
1602 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1604 ddReopenBalanceRegionCpu(cr->dd);
1608 if (inputrecNeedMutot(inputrec))
1614 gmx_sumd(2*DIM, mu, cr);
1615 ddReopenBalanceRegionCpu(cr->dd);
1617 for (i = 0; i < 2; i++)
1619 for (j = 0; j < DIM; j++)
1621 fr->mu_tot[i][j] = mu[i*DIM + j];
1625 if (fr->efep == efepNO)
1627 copy_rvec(fr->mu_tot[0], mu_tot);
1631 for (j = 0; j < DIM; j++)
1634 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1639 /* Reset energies */
1640 reset_enerdata(enerd);
1641 clear_rvecs(SHIFTS, fr->fshift);
1645 wallcycle_start(wcycle, ewcNS);
1647 if (graph && bStateChanged)
1649 /* Calculate intramolecular shift vectors to make molecules whole */
1650 mk_mshift(fplog, graph, fr->ePBC, box, x);
1653 /* Do the actual neighbour searching */
1655 groups, top, mdatoms,
1656 cr, nrnb, bFillGrid);
1658 wallcycle_stop(wcycle, ewcNS);
1661 if (inputrec->implicit_solvent && bNS)
1663 make_gb_nblist(cr, inputrec->gb_algorithm,
1664 x, box, fr, &top->idef, graph, fr->born);
1667 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1669 wallcycle_start(wcycle, ewcPPDURINGPME);
1670 dd_force_flop_start(cr->dd, nrnb);
1675 wallcycle_start(wcycle, ewcROT);
1676 do_rotation(cr, inputrec, box, x, t, step, bNS);
1677 wallcycle_stop(wcycle, ewcROT);
1680 /* Temporary solution until all routines take PaddedRVecVector */
1681 rvec *f = as_rvec_array(force->data());
1683 /* Start the force cycle counter.
1684 * Note that a different counter is used for dynamic load balancing.
1686 wallcycle_start(wcycle, ewcFORCE);
1688 fr->f_novirsum = force;
1691 /* If we need to compute the virial, we might need a separate
1692 * force buffer for algorithms for which the virial is calculated
1693 * separately, such as PME.
1695 if (fr->bF_NoVirSum && (flags & GMX_FORCE_VIRIAL))
1697 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1699 /* TODO: remove this - 1 when padding is properly implemented */
1700 clear_rvecs(fr->f_novirsum->size() - 1,
1701 as_rvec_array(fr->f_novirsum->data()));
1704 /* Clear the short- and long-range forces */
1705 clear_rvecs(fr->natoms_force_constr, f);
1707 clear_rvec(fr->vir_diag_posres);
1709 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1711 clear_pull_forces(inputrec->pull_work);
1714 /* update QMMMrec, if necessary */
1717 update_QMMMrec(cr, fr, x, mdatoms, box);
1720 /* Compute the bonded and non-bonded energies and optionally forces */
1721 do_force_lowlevel(fr, inputrec, &(top->idef),
1722 cr, nrnb, wcycle, mdatoms,
1723 x, hist, f, enerd, fcd, top, fr->born,
1725 inputrec->fepvals, lambda,
1726 graph, &(top->excls), fr->mu_tot,
1730 wallcycle_stop(wcycle, ewcFORCE);
1732 if (DOMAINDECOMP(cr))
1734 dd_force_flop_stop(cr->dd, nrnb);
1736 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1738 ddCloseBalanceRegionCpu(cr->dd);
1742 computeSpecialForces(cr, inputrec, step, t, wcycle,
1743 fr->forceProviders, box, x, mdatoms, lambda,
1744 flags, fr->f_novirsum, vir_force, enerd,
1749 /* Communicate the forces */
1750 if (DOMAINDECOMP(cr))
1752 wallcycle_start(wcycle, ewcMOVEF);
1753 dd_move_f(cr->dd, f, fr->fshift);
1754 /* Do we need to communicate the separate force array
1755 * for terms that do not contribute to the single sum virial?
1756 * Position restraints and electric fields do not introduce
1757 * inter-cg forces, only full electrostatics methods do.
1758 * When we do not calculate the virial, fr->f_novirsum = f,
1759 * so we have already communicated these forces.
1761 if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
1762 (flags & GMX_FORCE_VIRIAL))
1764 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1766 wallcycle_stop(wcycle, ewcMOVEF);
1769 /* If we have NoVirSum forces, but we do not calculate the virial,
1770 * we sum fr->f_novirsum=f later.
1772 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1774 wallcycle_start(wcycle, ewcVSITESPREAD);
1775 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1776 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1777 wallcycle_stop(wcycle, ewcVSITESPREAD);
1780 if (flags & GMX_FORCE_VIRIAL)
1782 /* Calculation of the virial must be done after vsites! */
1783 calc_virial(0, mdatoms->homenr, x, f,
1784 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1788 if (PAR(cr) && !(cr->duty & DUTY_PME))
1790 /* In case of node-splitting, the PP nodes receive the long-range
1791 * forces, virial and energy from the PME nodes here.
1793 pme_receive_force_ener(cr, wcycle, enerd, fr);
1798 post_process_forces(cr, step, nrnb, wcycle,
1799 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1803 if (flags & GMX_FORCE_ENERGY)
1805 /* Sum the potential energy terms from group contributions */
1806 sum_epot(&(enerd->grpp), enerd->term);
1808 if (!EI_TPI(inputrec->eI))
1810 checkPotentialEnergyValidity(enerd);
1816 void do_force(FILE *fplog, t_commrec *cr,
1817 t_inputrec *inputrec,
1818 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1819 gmx_localtop_t *top,
1820 gmx_groups_t *groups,
1821 matrix box, PaddedRVecVector *coordinates, history_t *hist,
1822 PaddedRVecVector *force,
1825 gmx_enerdata_t *enerd, t_fcdata *fcd,
1826 gmx::ArrayRef<real> lambda, t_graph *graph,
1828 gmx_vsite_t *vsite, rvec mu_tot,
1829 double t, gmx_edsam_t ed,
1830 gmx_bool bBornRadii,
1832 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1833 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1835 /* modify force flag if not doing nonbonded */
1836 if (!fr->bNonbonded)
1838 flags &= ~GMX_FORCE_NONBONDED;
1841 GMX_ASSERT(coordinates->size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
1842 GMX_ASSERT(force->size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
1844 rvec *x = as_rvec_array(coordinates->data());
1846 switch (inputrec->cutoff_scheme)
1849 do_force_cutsVERLET(fplog, cr, inputrec,
1857 lambda.data(), graph,
1863 ddOpenBalanceRegion,
1864 ddCloseBalanceRegion);
1867 do_force_cutsGROUP(fplog, cr, inputrec,
1875 lambda.data(), graph,
1880 ddOpenBalanceRegion,
1881 ddCloseBalanceRegion);
1884 gmx_incons("Invalid cut-off scheme passed!");
1887 /* In case we don't have constraints and are using GPUs, the next balancing
1888 * region starts here.
1889 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1890 * virial calculation and COM pulling, is not thus not included in
1891 * the balance timing, which is ok as most tasks do communication.
1893 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1895 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1900 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1901 t_inputrec *ir, t_mdatoms *md,
1902 t_state *state, t_commrec *cr, t_nrnb *nrnb,
1903 t_forcerec *fr, gmx_localtop_t *top)
1905 int i, m, start, end;
1907 real dt = ir->delta_t;
1911 /* We need to allocate one element extra, since we might use
1912 * (unaligned) 4-wide SIMD loads to access rvec entries.
1914 snew(savex, state->natoms + 1);
1921 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1922 start, md->homenr, end);
1924 /* Do a first constrain to reset particles... */
1925 step = ir->init_step;
1928 char buf[STEPSTRSIZE];
1929 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1930 gmx_step_str(step, buf));
1934 /* constrain the current position */
1935 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1936 ir, cr, step, 0, 1.0, md,
1937 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1938 fr->bMolPBC, state->box,
1939 state->lambda[efptBONDED], &dvdl_dum,
1940 nullptr, nullptr, nrnb, econqCoord);
1943 /* constrain the inital velocity, and save it */
1944 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
1945 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1946 ir, cr, step, 0, 1.0, md,
1947 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1948 fr->bMolPBC, state->box,
1949 state->lambda[efptBONDED], &dvdl_dum,
1950 nullptr, nullptr, nrnb, econqVeloc);
1952 /* constrain the inital velocities at t-dt/2 */
1953 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1955 for (i = start; (i < end); i++)
1957 for (m = 0; (m < DIM); m++)
1959 /* Reverse the velocity */
1960 state->v[i][m] = -state->v[i][m];
1961 /* Store the position at t-dt in buf */
1962 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1965 /* Shake the positions at t=-dt with the positions at t=0
1966 * as reference coordinates.
1970 char buf[STEPSTRSIZE];
1971 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1972 gmx_step_str(step, buf));
1975 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1976 ir, cr, step, -1, 1.0, md,
1977 as_rvec_array(state->x.data()), savex, nullptr,
1978 fr->bMolPBC, state->box,
1979 state->lambda[efptBONDED], &dvdl_dum,
1980 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
1982 for (i = start; i < end; i++)
1984 for (m = 0; m < DIM; m++)
1986 /* Re-reverse the velocities */
1987 state->v[i][m] = -state->v[i][m];
1996 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
1997 double *enerout, double *virout)
1999 double enersum, virsum;
2000 double invscale, invscale2, invscale3;
2001 double r, ea, eb, ec, pa, pb, pc, pd;
2006 invscale = 1.0/scale;
2007 invscale2 = invscale*invscale;
2008 invscale3 = invscale*invscale2;
2010 /* Following summation derived from cubic spline definition,
2011 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2012 * the cubic spline. We first calculate the negative of the
2013 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2014 * add the more standard, abrupt cutoff correction to that result,
2015 * yielding the long-range correction for a switched function. We
2016 * perform both the pressure and energy loops at the same time for
2017 * simplicity, as the computational cost is low. */
2021 /* Since the dispersion table has been scaled down a factor
2022 * 6.0 and the repulsion a factor 12.0 to compensate for the
2023 * c6/c12 parameters inside nbfp[] being scaled up (to save
2024 * flops in kernels), we need to correct for this.
2035 for (ri = rstart; ri < rend; ++ri)
2039 eb = 2.0*invscale2*r;
2043 pb = 3.0*invscale2*r;
2044 pc = 3.0*invscale*r*r;
2047 /* this "8" is from the packing in the vdwtab array - perhaps
2048 should be defined? */
2050 offset = 8*ri + offstart;
2051 y0 = vdwtab[offset];
2052 f = vdwtab[offset+1];
2053 g = vdwtab[offset+2];
2054 h = vdwtab[offset+3];
2056 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2057 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2059 *enerout = 4.0*M_PI*enersum*tabfactor;
2060 *virout = 4.0*M_PI*virsum*tabfactor;
2063 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2065 double eners[2], virs[2], enersum, virsum;
2066 double r0, rc3, rc9;
2068 real scale, *vdwtab;
2070 fr->enershiftsix = 0;
2071 fr->enershifttwelve = 0;
2072 fr->enerdiffsix = 0;
2073 fr->enerdifftwelve = 0;
2075 fr->virdifftwelve = 0;
2077 const interaction_const_t *ic = fr->ic;
2079 if (eDispCorr != edispcNO)
2081 for (i = 0; i < 2; i++)
2086 if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
2087 (ic->vdw_modifier == eintmodPOTSWITCH) ||
2088 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2089 (ic->vdwtype == evdwSHIFT) ||
2090 (ic->vdwtype == evdwSWITCH))
2092 if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
2093 (ic->vdw_modifier == eintmodFORCESWITCH) ||
2094 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
2097 "With dispersion correction rvdw-switch can not be zero "
2098 "for vdw-type = %s", evdw_names[ic->vdwtype]);
2101 /* TODO This code depends on the logic in tables.c that
2102 constructs the table layout, which should be made
2103 explicit in future cleanup. */
2104 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2105 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2106 scale = fr->dispersionCorrectionTable->scale;
2107 vdwtab = fr->dispersionCorrectionTable->data;
2109 /* Round the cut-offs to exact table values for precision */
2110 ri0 = static_cast<int>(floor(ic->rvdw_switch*scale));
2111 ri1 = static_cast<int>(ceil(ic->rvdw*scale));
2113 /* The code below has some support for handling force-switching, i.e.
2114 * when the force (instead of potential) is switched over a limited
2115 * region. This leads to a constant shift in the potential inside the
2116 * switching region, which we can handle by adding a constant energy
2117 * term in the force-switch case just like when we do potential-shift.
2119 * For now this is not enabled, but to keep the functionality in the
2120 * code we check separately for switch and shift. When we do force-switch
2121 * the shifting point is rvdw_switch, while it is the cutoff when we
2122 * have a classical potential-shift.
2124 * For a pure potential-shift the potential has a constant shift
2125 * all the way out to the cutoff, and that is it. For other forms
2126 * we need to calculate the constant shift up to the point where we
2127 * start modifying the potential.
2129 ri0 = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2135 if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
2136 (ic->vdwtype == evdwSHIFT))
2138 /* Determine the constant energy shift below rvdw_switch.
2139 * Table has a scale factor since we have scaled it down to compensate
2140 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2142 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2143 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2145 else if (ic->vdw_modifier == eintmodPOTSHIFT)
2147 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2148 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2151 /* Add the constant part from 0 to rvdw_switch.
2152 * This integration from 0 to rvdw_switch overcounts the number
2153 * of interactions by 1, as it also counts the self interaction.
2154 * We will correct for this later.
2156 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2157 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2159 /* Calculate the contribution in the range [r0,r1] where we
2160 * modify the potential. For a pure potential-shift modifier we will
2161 * have ri0==ri1, and there will not be any contribution here.
2163 for (i = 0; i < 2; i++)
2167 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2168 eners[i] -= enersum;
2172 /* Alright: Above we compensated by REMOVING the parts outside r0
2173 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2175 * Regardless of whether r0 is the point where we start switching,
2176 * or the cutoff where we calculated the constant shift, we include
2177 * all the parts we are missing out to infinity from r0 by
2178 * calculating the analytical dispersion correction.
2180 eners[0] += -4.0*M_PI/(3.0*rc3);
2181 eners[1] += 4.0*M_PI/(9.0*rc9);
2182 virs[0] += 8.0*M_PI/rc3;
2183 virs[1] += -16.0*M_PI/(3.0*rc9);
2185 else if (ic->vdwtype == evdwCUT ||
2186 EVDW_PME(ic->vdwtype) ||
2187 ic->vdwtype == evdwUSER)
2189 if (ic->vdwtype == evdwUSER && fplog)
2192 "WARNING: using dispersion correction with user tables\n");
2195 /* Note that with LJ-PME, the dispersion correction is multiplied
2196 * by the difference between the actual C6 and the value of C6
2197 * that would produce the combination rule.
2198 * This means the normal energy and virial difference formulas
2202 rc3 = ic->rvdw*ic->rvdw*ic->rvdw;
2204 /* Contribution beyond the cut-off */
2205 eners[0] += -4.0*M_PI/(3.0*rc3);
2206 eners[1] += 4.0*M_PI/(9.0*rc9);
2207 if (ic->vdw_modifier == eintmodPOTSHIFT)
2209 /* Contribution within the cut-off */
2210 eners[0] += -4.0*M_PI/(3.0*rc3);
2211 eners[1] += 4.0*M_PI/(3.0*rc9);
2213 /* Contribution beyond the cut-off */
2214 virs[0] += 8.0*M_PI/rc3;
2215 virs[1] += -16.0*M_PI/(3.0*rc9);
2220 "Dispersion correction is not implemented for vdw-type = %s",
2221 evdw_names[ic->vdwtype]);
2224 /* When we deprecate the group kernels the code below can go too */
2225 if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2227 /* Calculate self-interaction coefficient (assuming that
2228 * the reciprocal-space contribution is constant in the
2229 * region that contributes to the self-interaction).
2231 fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
2233 eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
2234 virs[0] += gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
2237 fr->enerdiffsix = eners[0];
2238 fr->enerdifftwelve = eners[1];
2239 /* The 0.5 is due to the Gromacs definition of the virial */
2240 fr->virdiffsix = 0.5*virs[0];
2241 fr->virdifftwelve = 0.5*virs[1];
2245 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2246 matrix box, real lambda, tensor pres, tensor virial,
2247 real *prescorr, real *enercorr, real *dvdlcorr)
2249 gmx_bool bCorrAll, bCorrPres;
2250 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2260 if (ir->eDispCorr != edispcNO)
2262 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2263 ir->eDispCorr == edispcAllEnerPres);
2264 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2265 ir->eDispCorr == edispcAllEnerPres);
2267 invvol = 1/det(box);
2270 /* Only correct for the interactions with the inserted molecule */
2271 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2276 dens = fr->numAtomsForDispersionCorrection*invvol;
2277 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2280 if (ir->efep == efepNO)
2282 avcsix = fr->avcsix[0];
2283 avctwelve = fr->avctwelve[0];
2287 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2288 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2291 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2292 *enercorr += avcsix*enerdiff;
2294 if (ir->efep != efepNO)
2296 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2300 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2301 *enercorr += avctwelve*enerdiff;
2302 if (fr->efep != efepNO)
2304 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2310 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2311 if (ir->eDispCorr == edispcAllEnerPres)
2313 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2315 /* The factor 2 is because of the Gromacs virial definition */
2316 spres = -2.0*invvol*svir*PRESFAC;
2318 for (m = 0; m < DIM; m++)
2320 virial[m][m] += svir;
2321 pres[m][m] += spres;
2326 /* Can't currently control when it prints, for now, just print when degugging */
2331 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2337 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2338 *enercorr, spres, svir);
2342 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2346 if (fr->efep != efepNO)
2348 *dvdlcorr += dvdlambda;
2353 static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2354 const gmx_mtop_t *mtop, rvec x[],
2359 gmx_molblock_t *molb;
2361 if (bFirst && fplog)
2363 fprintf(fplog, "Removing pbc first time\n");
2368 for (mb = 0; mb < mtop->nmolblock; mb++)
2370 molb = &mtop->molblock[mb];
2371 if (molb->natoms_mol == 1 ||
2372 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2374 /* Just one atom or charge group in the molecule, no PBC required */
2375 as += molb->nmol*molb->natoms_mol;
2379 /* Pass NULL iso fplog to avoid graph prints for each molecule type */
2380 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2381 0, molb->natoms_mol, FALSE, FALSE, graph);
2383 for (mol = 0; mol < molb->nmol; mol++)
2385 mk_mshift(fplog, graph, ePBC, box, x+as);
2387 shift_self(graph, box, x+as);
2388 /* The molecule is whole now.
2389 * We don't need the second mk_mshift call as in do_pbc_first,
2390 * since we no longer need this graph.
2393 as += molb->natoms_mol;
2401 void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
2402 const gmx_mtop_t *mtop, rvec x[])
2404 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2407 void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
2408 const gmx_mtop_t *mtop, rvec x[])
2410 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2413 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2416 nth = gmx_omp_nthreads_get(emntDefault);
2418 #pragma omp parallel for num_threads(nth) schedule(static)
2419 for (t = 0; t < nth; t++)
2425 offset = (natoms*t )/nth;
2426 len = (natoms*(t + 1))/nth - offset;
2427 put_atoms_in_box(ePBC, box, len, x + offset);
2429 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2433 // TODO This can be cleaned up a lot, and move back to runner.cpp
2434 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2435 const t_inputrec *inputrec,
2436 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2437 gmx_walltime_accounting_t walltime_accounting,
2438 nonbonded_verlet_t *nbv,
2440 gmx_bool bWriteStat)
2442 t_nrnb *nrnb_tot = nullptr;
2444 double nbfs = 0, mflop = 0;
2445 double elapsed_time,
2446 elapsed_time_over_all_ranks,
2447 elapsed_time_over_all_threads,
2448 elapsed_time_over_all_threads_over_all_ranks;
2449 /* Control whether it is valid to print a report. Only the
2450 simulation master may print, but it should not do so if the run
2451 terminated e.g. before a scheduled reset step. This is
2452 complicated by the fact that PME ranks are unaware of the
2453 reason why they were sent a pmerecvqxFINISH. To avoid
2454 communication deadlocks, we always do the communication for the
2455 report, even if we've decided not to write the report, because
2456 how long it takes to finish the run is not important when we've
2457 decided not to report on the simulation performance. */
2458 bool printReport = SIMMASTER(cr);
2460 if (!walltime_accounting_get_valid_finish(walltime_accounting))
2462 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2463 printReport = false;
2470 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2471 cr->mpi_comm_mysim);
2479 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2480 elapsed_time_over_all_ranks = elapsed_time;
2481 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2482 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2486 /* reduce elapsed_time over all MPI ranks in the current simulation */
2487 MPI_Allreduce(&elapsed_time,
2488 &elapsed_time_over_all_ranks,
2489 1, MPI_DOUBLE, MPI_SUM,
2490 cr->mpi_comm_mysim);
2491 elapsed_time_over_all_ranks /= cr->nnodes;
2492 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2493 * current simulation. */
2494 MPI_Allreduce(&elapsed_time_over_all_threads,
2495 &elapsed_time_over_all_threads_over_all_ranks,
2496 1, MPI_DOUBLE, MPI_SUM,
2497 cr->mpi_comm_mysim);
2503 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2510 if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2512 print_dd_statistics(cr, inputrec, fplog);
2515 /* TODO Move the responsibility for any scaling by thread counts
2516 * to the code that handled the thread region, so that there's a
2517 * mechanism to keep cycle counting working during the transition
2518 * to task parallelism. */
2519 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2520 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2521 wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2522 auto cycle_sum(wallcycle_sum(cr, wcycle));
2526 auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2527 gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
2528 if (pme_gpu_task_enabled(pme))
2530 pme_gpu_get_timings(pme, &pme_gpu_timings);
2532 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2533 elapsed_time_over_all_ranks,
2538 if (EI_DYNAMICS(inputrec->eI))
2540 delta_t = inputrec->delta_t;
2545 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2546 elapsed_time_over_all_ranks,
2547 walltime_accounting_get_nsteps_done(walltime_accounting),
2548 delta_t, nbfs, mflop);
2552 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2553 elapsed_time_over_all_ranks,
2554 walltime_accounting_get_nsteps_done(walltime_accounting),
2555 delta_t, nbfs, mflop);
2560 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2562 /* this function works, but could probably use a logic rewrite to keep all the different
2563 types of efep straight. */
2565 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2570 t_lambda *fep = ir->fepvals;
2571 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2572 if checkpoint is set -- a kludge is in for now
2575 for (int i = 0; i < efptNR; i++)
2577 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2578 if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
2580 lambda[i] = fep->init_lambda;
2583 lam0[i] = lambda[i];
2588 lambda[i] = fep->all_lambda[i][*fep_state];
2591 lam0[i] = lambda[i];
2597 /* need to rescale control temperatures to match current state */
2598 for (int i = 0; i < ir->opts.ngtc; i++)
2600 if (ir->opts.ref_t[i] > 0)
2602 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2607 /* Send to the log the information on the current lambdas */
2608 if (fplog != nullptr)
2610 fprintf(fplog, "Initial vector of lambda components:[ ");
2611 for (const auto &l : lambda)
2613 fprintf(fplog, "%10.4f ", l);
2615 fprintf(fplog, "]\n");
2621 void init_md(FILE *fplog,
2622 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2623 t_inputrec *ir, const gmx_output_env_t *oenv,
2624 const MdrunOptions &mdrunOptions,
2625 double *t, double *t0,
2626 t_state *globalState, double *lam0,
2627 t_nrnb *nrnb, gmx_mtop_t *mtop,
2629 int nfile, const t_filenm fnm[],
2630 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2631 tensor force_vir, tensor shake_vir, rvec mu_tot,
2632 gmx_bool *bSimAnn, t_vcm **vcm,
2633 gmx_wallcycle_t wcycle)
2637 /* Initial values */
2638 *t = *t0 = ir->init_t;
2641 for (i = 0; i < ir->opts.ngtc; i++)
2643 /* set bSimAnn if any group is being annealed */
2644 if (ir->opts.annealing[i] != eannNO)
2650 /* Initialize lambda variables */
2651 /* TODO: Clean up initialization of fep_state and lambda in t_state.
2652 * We currently need to call initialize_lambdas on non-master ranks
2653 * to initialize lam0.
2657 initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
2662 std::array<real, efptNR> tmpLambda;
2663 initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
2666 // TODO upd is never NULL in practice, but the analysers don't know that
2669 *upd = init_update(ir);
2673 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2678 *vcm = init_vcm(fplog, &mtop->groups, ir);
2681 if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
2683 if (ir->etc == etcBERENDSEN)
2685 please_cite(fplog, "Berendsen84a");
2687 if (ir->etc == etcVRESCALE)
2689 please_cite(fplog, "Bussi2007a");
2691 if (ir->eI == eiSD1)
2693 please_cite(fplog, "Goga2012");
2700 *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
2702 *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
2703 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2706 /* Initiate variables */
2707 clear_mat(force_vir);
2708 clear_mat(shake_vir);